#ifndef CUFFTDX_FFT_125_FP32_FWD_PTX_HPP
#define CUFFTDX_FFT_125_FP32_FWD_PTX_HPP



// Specialization <159, float, 1> of cufftdx_private_function, implemented as
// one inline-PTX statement. Judging by the include guard
// (CUFFTDX_FFT_125_FP32_FWD_PTX_HPP) this is a piece of a 125-point forward
// single-precision FFT -- TODO confirm against the code that dispatches on
// the function id.
//
// Parameters:
//   rmem - per-thread array of complex<float>; rmem[0..24] (.x and .y) are all
//          read as inputs and all overwritten with the results.
//   smem - 32-bit value used as the base address for the st.shared/ld.shared
//          accesses below.
// Also reads lut_sp_25_125 (declared outside this block); it is passed as a
// 64-bit address and read with ld.global.
//
// Order of work inside the PTX string:
//   1. Five 5-point butterflies over inputs {k, k+5, k+10, k+15, k+20} for
//      k = 0..4. The recurring float literals decode to approximately
//      0.309017 (0f3E9E377A), 0.809017 (0f3F4F1BBD), 0.951057 (0f3F737871)
//      and +/-0.587785 (0f3F167918 / 0fBF167918).
//   2. Most of those butterfly outputs are multiplied by hard-coded complex
//      constants, then five more 5-point butterflies combine the results.
//   3. tid.x is split into q = tid.x / 5 (multiply by 0xCCCCCCCD, shift right
//      by 34) and m = tid.x % 5. One complex value w is loaded from
//      lut_sp_25_125 + 8*m; w^2 .. w^24 are built by repeated complex
//      multiplication, and the j-th intermediate result (j = 1..24) is
//      multiplied by w^j. Result 0 is stored unscaled.
//   4. barrier.sync 0, then each thread stores its 25 complex values
//      contiguously (8 bytes apart) starting at
//      smem + 1000*tid.y + 1000*q + 200*m; barrier.sync 0 again, then each
//      thread loads 25 complex values 40 bytes apart starting at
//      smem + 1000*tid.y + 1000*q + 8*m.
//   5. Five final 5-point butterflies over the loaded values write the outputs.
//
// Because of the two barrier.sync 0 instructions, presumably every thread of
// the block has to execute this function together -- confirm against callers.
//
// NOTE(review): the asm statement reads global memory and reads/writes shared
// memory, yet it declares no clobber list (no "memory" clobber). Confirm that
// callers do not depend on the compiler ordering their own shared-memory
// accesses relative to this statement.
template<> __forceinline__ __device__ void cufftdx_private_function<159, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// The whole computation lives in the raw string below; nothing inside the
// string may be edited without changing the emitted PTX.
asm volatile (R"({
.reg .f32 f<1427>;
.reg .b32 r<14>;
.reg .b64 rd<10>;
mov.u32 r12, %tid.y;
mov.u32 r13, %50;
mad.lo.s32 r3, r12, 1000, r13;
add.f32 f101, %62, %92;
add.f32 f103, %72, %82;
add.f32 f1426, %52, f101;
add.f32 f104, f103, f1426;
add.f32 f105, %102, %104;
add.f32 f107, %103, %83;
add.f32 f1422, %53, f105;
add.f32 f108, f107, f1422;
mul.f32 f110, f103, 0f3F4F1BBD;
fma.rn.f32 f1421, f101, 0f3E9E377A, %52;
sub.f32 f111, f1421, f110;
sub.f32 f112, %102, %104;
sub.f32 f114, %103, %83;
mul.f32 f1419, f112, 0f3F737871;
mul.f32 f1420, f114, 0fBF167918;
sub.f32 f116, f1420, f1419;
sub.f32 f117, f111, f116;
add.f32 f118, f116, f111;
mul.f32 f119, f101, 0f3F4F1BBD;
sub.f32 f120, %52, f119;
fma.rn.f32 f121, f103, 0f3E9E377A, f120;
mul.f32 f122, f112, 0f3F167918;
mul.f32 f123, f114, 0f3F737871;
sub.f32 f124, f123, f122;
sub.f32 f125, f121, f124;
add.f32 f126, f124, f121;
fma.rn.f32 f1417, f105, 0f3E9E377A, %53;
mul.f32 f1418, f107, 0f3F4F1BBD;
sub.f32 f129, f1417, f1418;
sub.f32 f130, %62, %92;
sub.f32 f132, %72, %82;
mul.f32 f1415, f130, 0f3F737871;
mul.f32 f1416, f132, 0fBF167918;
sub.f32 f134, f1416, f1415;
add.f32 f135, f134, f129;
sub.f32 f136, f129, f134;
mul.f32 f137, f105, 0f3F4F1BBD;
sub.f32 f138, %53, f137;
fma.rn.f32 f139, f107, 0f3E9E377A, f138;
mul.f32 f140, f130, 0f3F167918;
mul.f32 f141, f132, 0f3F737871;
sub.f32 f142, f141, f140;
add.f32 f143, f142, f139;
sub.f32 f144, f139, f142;
add.f32 f145, %64, %94;
add.f32 f147, %74, %84;
add.f32 f1414, %54, f145;
add.f32 f148, f147, f1414;
add.f32 f149, %65, %95;
add.f32 f151, %107, %105;
add.f32 f1410, %106, f149;
add.f32 f152, f151, f1410;
fma.rn.f32 f1408, f145, 0f3E9E377A, %54;
mul.f32 f1409, f147, 0f3F4F1BBD;
sub.f32 f155, f1408, f1409;
sub.f32 f156, %65, %95;
sub.f32 f158, %107, %105;
mul.f32 f1406, f156, 0f3F737871;
mul.f32 f1407, f158, 0fBF167918;
sub.f32 f160, f1407, f1406;
sub.f32 f161, f155, f160;
add.f32 f162, f160, f155;
mul.f32 f163, f145, 0f3F4F1BBD;
sub.f32 f164, %54, f163;
fma.rn.f32 f165, f147, 0f3E9E377A, f164;
mul.f32 f166, f156, 0f3F167918;
mul.f32 f167, f158, 0f3F737871;
sub.f32 f168, f167, f166;
sub.f32 f169, f165, f168;
add.f32 f170, f168, f165;
mul.f32 f172, f151, 0f3F4F1BBD;
fma.rn.f32 f1405, f149, 0f3E9E377A, %106;
sub.f32 f173, f1405, f172;
sub.f32 f174, %64, %94;
sub.f32 f176, %74, %84;
mul.f32 f177, f176, 0fBF167918;
mul.f32 f1404, f174, 0f3F737871;
sub.f32 f178, f177, f1404;
add.f32 f179, f178, f173;
sub.f32 f180, f173, f178;
mul.f32 f181, f149, 0f3F4F1BBD;
sub.f32 f182, %106, f181;
fma.rn.f32 f183, f151, 0f3E9E377A, f182;
mul.f32 f184, f174, 0f3F167918;
mul.f32 f185, f176, 0f3F737871;
sub.f32 f186, f185, f184;
add.f32 f187, f186, f183;
sub.f32 f188, f183, f186;
add.f32 f189, %66, %96;
add.f32 f191, %76, %86;
add.f32 f1403, %56, f189;
add.f32 f192, f191, f1403;
add.f32 f193, %110, %109;
add.f32 f195, %77, %111;
add.f32 f1398, %108, f193;
add.f32 f196, f195, f1398;
mul.f32 f198, f191, 0f3F4F1BBD;
fma.rn.f32 f1397, f189, 0f3E9E377A, %56;
sub.f32 f199, f1397, f198;
sub.f32 f200, %110, %109;
sub.f32 f202, %77, %111;
mul.f32 f203, f202, 0fBF167918;
mul.f32 f1396, f200, 0f3F737871;
sub.f32 f204, f203, f1396;
sub.f32 f205, f199, f204;
add.f32 f206, f204, f199;
mul.f32 f207, f189, 0f3F4F1BBD;
sub.f32 f208, %56, f207;
fma.rn.f32 f209, f191, 0f3E9E377A, f208;
mul.f32 f210, f200, 0f3F167918;
mul.f32 f211, f202, 0f3F737871;
sub.f32 f212, f211, f210;
sub.f32 f213, f209, f212;
add.f32 f214, f212, f209;
fma.rn.f32 f1394, f193, 0f3E9E377A, %108;
mul.f32 f1395, f195, 0f3F4F1BBD;
sub.f32 f217, f1394, f1395;
sub.f32 f218, %66, %96;
sub.f32 f220, %76, %86;
mul.f32 f1392, f218, 0f3F737871;
mul.f32 f1393, f220, 0fBF167918;
sub.f32 f222, f1393, f1392;
add.f32 f223, f222, f217;
sub.f32 f224, f217, f222;
mul.f32 f225, f193, 0f3F4F1BBD;
sub.f32 f226, %108, f225;
fma.rn.f32 f227, f195, 0f3E9E377A, f226;
mul.f32 f228, f218, 0f3F167918;
mul.f32 f229, f220, 0f3F737871;
sub.f32 f230, f229, f228;
add.f32 f231, f230, f227;
sub.f32 f232, f227, f230;
add.f32 f233, %68, %98;
add.f32 f235, %78, %88;
add.f32 f1391, %58, f233;
add.f32 f236, f235, f1391;
add.f32 f237, %113, %112;
add.f32 f239, %114, %89;
add.f32 f1387, %59, f237;
add.f32 f240, f239, f1387;
fma.rn.f32 f1385, f233, 0f3E9E377A, %58;
mul.f32 f1386, f235, 0f3F4F1BBD;
sub.f32 f243, f1385, f1386;
sub.f32 f244, %113, %112;
sub.f32 f246, %114, %89;
mul.f32 f1383, f244, 0f3F737871;
mul.f32 f1384, f246, 0fBF167918;
sub.f32 f248, f1384, f1383;
sub.f32 f249, f243, f248;
add.f32 f250, f248, f243;
mul.f32 f251, f233, 0f3F4F1BBD;
sub.f32 f252, %58, f251;
fma.rn.f32 f253, f235, 0f3E9E377A, f252;
mul.f32 f254, f244, 0f3F167918;
mul.f32 f255, f246, 0f3F737871;
sub.f32 f256, f255, f254;
sub.f32 f257, f253, f256;
add.f32 f258, f256, f253;
mul.f32 f260, f239, 0f3F4F1BBD;
fma.rn.f32 f1382, f237, 0f3E9E377A, %59;
sub.f32 f261, f1382, f260;
sub.f32 f262, %68, %98;
sub.f32 f264, %78, %88;
mul.f32 f1380, f262, 0f3F737871;
mul.f32 f1381, f264, 0fBF167918;
sub.f32 f266, f1381, f1380;
add.f32 f267, f266, f261;
sub.f32 f268, f261, f266;
mul.f32 f269, f237, 0f3F4F1BBD;
sub.f32 f270, %59, f269;
fma.rn.f32 f271, f239, 0f3E9E377A, f270;
mul.f32 f272, f262, 0f3F167918;
mul.f32 f273, f264, 0f3F737871;
sub.f32 f274, f273, f272;
add.f32 f275, f274, f271;
sub.f32 f276, f271, f274;
add.f32 f277, %70, %100;
add.f32 f279, %80, %90;
add.f32 f1379, %60, f277;
add.f32 f280, f279, f1379;
add.f32 f281, %71, %101;
add.f32 f283, %117, %115;
add.f32 f1375, %116, f281;
add.f32 f284, f283, f1375;
mul.f32 f286, f279, 0f3F4F1BBD;
fma.rn.f32 f1374, f277, 0f3E9E377A, %60;
sub.f32 f287, f1374, f286;
sub.f32 f288, %71, %101;
sub.f32 f290, %117, %115;
mul.f32 f1372, f288, 0f3F737871;
mul.f32 f1373, f290, 0fBF167918;
sub.f32 f292, f1373, f1372;
sub.f32 f293, f287, f292;
add.f32 f294, f292, f287;
mul.f32 f295, f277, 0f3F4F1BBD;
sub.f32 f296, %60, f295;
fma.rn.f32 f297, f279, 0f3E9E377A, f296;
mul.f32 f298, f288, 0f3F167918;
mul.f32 f299, f290, 0f3F737871;
sub.f32 f300, f299, f298;
sub.f32 f301, f297, f300;
add.f32 f302, f300, f297;
mul.f32 f304, f283, 0f3F4F1BBD;
fma.rn.f32 f1371, f281, 0f3E9E377A, %116;
sub.f32 f305, f1371, f304;
sub.f32 f306, %70, %100;
sub.f32 f308, %80, %90;
mul.f32 f1369, f306, 0f3F737871;
mul.f32 f1370, f308, 0fBF167918;
sub.f32 f310, f1370, f1369;
add.f32 f311, f310, f305;
sub.f32 f312, f305, f310;
mul.f32 f313, f281, 0f3F4F1BBD;
sub.f32 f314, %116, f313;
fma.rn.f32 f315, f283, 0f3E9E377A, f314;
mul.f32 f316, f306, 0f3F167918;
mul.f32 f317, f308, 0f3F737871;
sub.f32 f318, f317, f316;
add.f32 f319, f318, f315;
sub.f32 f320, f315, f318;
mul.f32 f322, f179, 0fBE7EA890;
mul.f32 f1368, f161, 0f3F77F511;
sub.f32 f323, f1368, f322;
mul.f32 f324, f179, 0f3F77F511;
fma.rn.f32 f325, f161, 0fBE7EA890, f324;
mul.f32 f327, f223, 0fBEF6A86B;
mul.f32 f1367, f205, 0f3F6055A2;
sub.f32 f328, f1367, f327;
mul.f32 f329, f223, 0f3F6055A2;
fma.rn.f32 f330, f205, 0fBEF6A86B, f329;
mul.f32 f332, f267, 0fBF2F3E7B;
mul.f32 f1366, f249, 0f3F3A9DB0;
sub.f32 f333, f1366, f332;
mul.f32 f334, f267, 0f3F3A9DB0;
fma.rn.f32 f335, f249, 0fBF2F3E7B, f334;
mul.f32 f337, f311, 0fBF5825E0;
mul.f32 f1365, f293, 0f3F092BF2;
sub.f32 f338, f1365, f337;
mul.f32 f339, f311, 0f3F092BF2;
fma.rn.f32 f340, f293, 0fBF5825E0, f339;
mul.f32 f342, f187, 0fBEF6A86B;
mul.f32 f1364, f169, 0f3F6055A2;
sub.f32 f343, f1364, f342;
mul.f32 f344, f187, 0f3F6055A2;
fma.rn.f32 f345, f169, 0fBEF6A86B, f344;
mul.f32 f1362, f213, 0f3F092BF2;
mul.f32 f1363, f231, 0fBF5825E0;
sub.f32 f348, f1362, f1363;
mul.f32 f349, f231, 0f3F092BF2;
fma.rn.f32 f350, f213, 0fBF5825E0, f349;
mul.f32 f1360, f257, 0f3D809851;
mul.f32 f1361, f275, 0fBF7F7EAE;
sub.f32 f353, f1360, f1361;
mul.f32 f354, f275, 0f3D809851;
fma.rn.f32 f355, f257, 0fBF7F7EAE, f354;
mul.f32 f1358, f301, 0fBED9FFBE;
mul.f32 f1359, f319, 0fBF67A2BF;
sub.f32 f358, f1358, f1359;
mul.f32 f359, f319, 0fBED9FFBE;
fma.rn.f32 f360, f301, 0fBF67A2BF, f359;
mul.f32 f1356, f170, 0f3F3A9DB0;
mul.f32 f1357, f188, 0fBF2F3E7B;
sub.f32 f363, f1356, f1357;
mul.f32 f364, f188, 0f3F3A9DB0;
fma.rn.f32 f365, f170, 0fBF2F3E7B, f364;
mul.f32 f367, f232, 0fBF7F7EAE;
mul.f32 f1355, f214, 0f3D809851;
sub.f32 f368, f1355, f367;
mul.f32 f369, f232, 0f3D809851;
fma.rn.f32 f370, f214, 0fBF7F7EAE, f369;
mul.f32 f372, f276, 0fBF45405B;
mul.f32 f1354, f258, 0fBF232E38;
sub.f32 f373, f1354, f372;
mul.f32 f374, f276, 0fBF232E38;
fma.rn.f32 f375, f258, 0fBF45405B, f374;
mul.f32 f377, f320, 0fBE00575B;
mul.f32 f1353, f302, 0fBF7DFB3B;
sub.f32 f378, f1353, f377;
mul.f32 f379, f320, 0fBF7DFB3B;
fma.rn.f32 f380, f302, 0fBE00575B, f379;
mul.f32 f382, f180, 0fBF5825E0;
mul.f32 f1352, f162, 0f3F092BF2;
sub.f32 f383, f1352, f382;
mul.f32 f384, f180, 0f3F092BF2;
fma.rn.f32 f385, f162, 0fBF5825E0, f384;
mul.f32 f387, f224, 0fBF67A2BF;
mul.f32 f1351, f206, 0fBED9FFBE;
sub.f32 f388, f1351, f387;
mul.f32 f389, f224, 0fBED9FFBE;
fma.rn.f32 f390, f206, 0fBF67A2BF, f389;
mul.f32 f1349, f250, 0fBF7DFB3B;
mul.f32 f1350, f268, 0fBE00575B;
sub.f32 f393, f1349, f1350;
mul.f32 f394, f268, 0fBF7DFB3B;
fma.rn.f32 f395, f250, 0fBE00575B, f394;
mul.f32 f1347, f294, 0fBF232E38;
mul.f32 f1348, f312, 0f3F45405B;
sub.f32 f398, f1347, f1348;
mul.f32 f399, f312, 0fBF232E38;
fma.rn.f32 f400, f294, 0f3F45405B, f399;
add.f32 f401, f148, f280;
add.f32 f403, f192, f236;
mul.f32 f408, f403, 0f3F4F1BBD;
fma.rn.f32 f1346, f401, 0f3E9E377A, f104;
sub.f32 f409, f1346, f408;
add.f32 f1345, f152, f284;
sub.f32 f410, f152, f284;
add.f32 f1344, f196, f240;
sub.f32 f412, f196, f240;
mul.f32 f413, f412, 0fBF167918;
mul.f32 f1343, f410, 0f3F737871;
sub.f32 f414, f413, f1343;
sub.f32 f415, f409, f414;
add.f32 f416, f414, f409;
add.f32 f1342, f104, f401;
mul.f32 f417, f401, 0f3F4F1BBD;
sub.f32 f418, f104, f417;
fma.rn.f32 f419, f403, 0f3E9E377A, f418;
mul.f32 f420, f410, 0f3F167918;
mul.f32 f421, f412, 0f3F737871;
sub.f32 f422, f421, f420;
sub.f32 f423, f419, f422;
add.f32 f424, f422, f419;
fma.rn.f32 f1340, f1345, 0f3E9E377A, f108;
mul.f32 f1341, f1344, 0f3F4F1BBD;
sub.f32 f427, f1340, f1341;
sub.f32 f428, f148, f280;
sub.f32 f430, f192, f236;
mul.f32 f1338, f428, 0f3F737871;
mul.f32 f1339, f430, 0fBF167918;
sub.f32 f432, f1339, f1338;
add.f32 f433, f432, f427;
sub.f32 f434, f427, f432;
add.f32 f1337, f108, f1345;
mul.f32 f435, f1345, 0f3F4F1BBD;
sub.f32 f436, f108, f435;
fma.rn.f32 f437, f1344, 0f3E9E377A, f436;
mul.f32 f438, f428, 0f3F167918;
mul.f32 f439, f430, 0f3F737871;
sub.f32 f440, f439, f438;
add.f32 f441, f440, f437;
sub.f32 f442, f437, f440;
add.f32 f443, f323, f338;
add.f32 f445, f328, f333;
add.f32 f1336, f117, f443;
add.f32 f446, f445, f1336;
add.f32 f447, f325, f340;
add.f32 f449, f330, f335;
add.f32 f1335, f135, f447;
add.f32 f450, f449, f1335;
fma.rn.f32 f1333, f443, 0f3E9E377A, f117;
mul.f32 f1334, f445, 0f3F4F1BBD;
sub.f32 f453, f1333, f1334;
sub.f32 f454, f325, f340;
sub.f32 f456, f330, f335;
mul.f32 f1331, f454, 0f3F737871;
mul.f32 f1332, f456, 0fBF167918;
sub.f32 f458, f1332, f1331;
sub.f32 f459, f453, f458;
add.f32 f460, f458, f453;
mul.f32 f461, f443, 0f3F4F1BBD;
sub.f32 f462, f117, f461;
fma.rn.f32 f463, f445, 0f3E9E377A, f462;
mul.f32 f464, f454, 0f3F167918;
mul.f32 f465, f456, 0f3F737871;
sub.f32 f466, f465, f464;
sub.f32 f467, f463, f466;
add.f32 f468, f466, f463;
mul.f32 f470, f449, 0f3F4F1BBD;
fma.rn.f32 f1330, f447, 0f3E9E377A, f135;
sub.f32 f471, f1330, f470;
sub.f32 f472, f323, f338;
sub.f32 f474, f328, f333;
mul.f32 f1328, f472, 0f3F737871;
mul.f32 f1329, f474, 0fBF167918;
sub.f32 f476, f1329, f1328;
add.f32 f477, f476, f471;
sub.f32 f478, f471, f476;
mul.f32 f479, f447, 0f3F4F1BBD;
sub.f32 f480, f135, f479;
fma.rn.f32 f481, f449, 0f3E9E377A, f480;
mul.f32 f482, f472, 0f3F167918;
mul.f32 f483, f474, 0f3F737871;
sub.f32 f484, f483, f482;
add.f32 f485, f484, f481;
sub.f32 f486, f481, f484;
add.f32 f487, f343, f358;
add.f32 f489, f348, f353;
add.f32 f1327, f125, f487;
add.f32 f490, f489, f1327;
add.f32 f491, f345, f360;
add.f32 f493, f350, f355;
add.f32 f1326, f143, f491;
add.f32 f494, f493, f1326;
mul.f32 f496, f489, 0f3F4F1BBD;
fma.rn.f32 f1325, f487, 0f3E9E377A, f125;
sub.f32 f497, f1325, f496;
sub.f32 f498, f345, f360;
sub.f32 f500, f350, f355;
mul.f32 f1323, f498, 0f3F737871;
mul.f32 f1324, f500, 0fBF167918;
sub.f32 f502, f1324, f1323;
sub.f32 f503, f497, f502;
add.f32 f504, f502, f497;
mul.f32 f505, f487, 0f3F4F1BBD;
sub.f32 f506, f125, f505;
fma.rn.f32 f507, f489, 0f3E9E377A, f506;
mul.f32 f508, f498, 0f3F167918;
mul.f32 f509, f500, 0f3F737871;
sub.f32 f510, f509, f508;
sub.f32 f511, f507, f510;
add.f32 f512, f510, f507;
mul.f32 f514, f493, 0f3F4F1BBD;
fma.rn.f32 f1322, f491, 0f3E9E377A, f143;
sub.f32 f515, f1322, f514;
sub.f32 f516, f343, f358;
sub.f32 f518, f348, f353;
mul.f32 f1320, f516, 0f3F737871;
mul.f32 f1321, f518, 0fBF167918;
sub.f32 f520, f1321, f1320;
add.f32 f521, f520, f515;
sub.f32 f522, f515, f520;
mul.f32 f523, f491, 0f3F4F1BBD;
sub.f32 f524, f143, f523;
fma.rn.f32 f525, f493, 0f3E9E377A, f524;
mul.f32 f526, f516, 0f3F167918;
mul.f32 f527, f518, 0f3F737871;
sub.f32 f528, f527, f526;
add.f32 f529, f528, f525;
sub.f32 f530, f525, f528;
add.f32 f531, f363, f378;
add.f32 f533, f368, f373;
add.f32 f1319, f126, f531;
add.f32 f534, f533, f1319;
add.f32 f535, f365, f380;
add.f32 f537, f370, f375;
add.f32 f1318, f144, f535;
add.f32 f538, f537, f1318;
mul.f32 f540, f533, 0f3F4F1BBD;
fma.rn.f32 f1317, f531, 0f3E9E377A, f126;
sub.f32 f541, f1317, f540;
sub.f32 f542, f365, f380;
sub.f32 f544, f370, f375;
mul.f32 f1315, f542, 0f3F737871;
mul.f32 f1316, f544, 0fBF167918;
sub.f32 f546, f1316, f1315;
sub.f32 f547, f541, f546;
add.f32 f548, f546, f541;
mul.f32 f549, f531, 0f3F4F1BBD;
sub.f32 f550, f126, f549;
fma.rn.f32 f551, f533, 0f3E9E377A, f550;
mul.f32 f552, f542, 0f3F167918;
mul.f32 f553, f544, 0f3F737871;
sub.f32 f554, f553, f552;
sub.f32 f555, f551, f554;
add.f32 f556, f554, f551;
fma.rn.f32 f1313, f535, 0f3E9E377A, f144;
mul.f32 f1314, f537, 0f3F4F1BBD;
sub.f32 f559, f1313, f1314;
sub.f32 f560, f363, f378;
sub.f32 f562, f368, f373;
mul.f32 f1311, f560, 0f3F737871;
mul.f32 f1312, f562, 0fBF167918;
sub.f32 f564, f1312, f1311;
add.f32 f565, f564, f559;
sub.f32 f566, f559, f564;
mul.f32 f567, f535, 0f3F4F1BBD;
sub.f32 f568, f144, f567;
fma.rn.f32 f569, f537, 0f3E9E377A, f568;
mul.f32 f570, f560, 0f3F167918;
mul.f32 f571, f562, 0f3F737871;
sub.f32 f572, f571, f570;
add.f32 f573, f572, f569;
sub.f32 f574, f569, f572;
add.f32 f575, f383, f398;
add.f32 f577, f388, f393;
add.f32 f1310, f118, f575;
add.f32 f578, f577, f1310;
add.f32 f579, f385, f400;
add.f32 f581, f390, f395;
add.f32 f1309, f136, f579;
add.f32 f582, f581, f1309;
fma.rn.f32 f1307, f575, 0f3E9E377A, f118;
mul.f32 f1308, f577, 0f3F4F1BBD;
sub.f32 f585, f1307, f1308;
sub.f32 f586, f385, f400;
sub.f32 f588, f390, f395;
mul.f32 f1305, f586, 0f3F737871;
mul.f32 f1306, f588, 0fBF167918;
sub.f32 f590, f1306, f1305;
sub.f32 f591, f585, f590;
add.f32 f592, f590, f585;
mul.f32 f593, f575, 0f3F4F1BBD;
sub.f32 f594, f118, f593;
fma.rn.f32 f595, f577, 0f3E9E377A, f594;
mul.f32 f596, f586, 0f3F167918;
mul.f32 f597, f588, 0f3F737871;
sub.f32 f598, f597, f596;
sub.f32 f599, f595, f598;
add.f32 f600, f598, f595;
mul.f32 f602, f581, 0f3F4F1BBD;
fma.rn.f32 f1304, f579, 0f3E9E377A, f136;
sub.f32 f603, f1304, f602;
sub.f32 f604, f383, f398;
sub.f32 f606, f388, f393;
mul.f32 f1302, f604, 0f3F737871;
mul.f32 f1303, f606, 0fBF167918;
sub.f32 f608, f1303, f1302;
add.f32 f609, f608, f603;
sub.f32 f610, f603, f608;
mul.f32 f611, f579, 0f3F4F1BBD;
sub.f32 f612, f136, f611;
fma.rn.f32 f613, f581, 0f3E9E377A, f612;
mul.f32 f614, f604, 0f3F167918;
mul.f32 f615, f606, 0f3F737871;
sub.f32 f616, f615, f614;
add.f32 f617, f616, f613;
sub.f32 f618, f613, f616;
mov.u32 r11, %tid.x;
mul.wide.u32 rd2, r11, -858993459;
shr.u64 rd3, rd2, 34;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 5;
sub.s32 r7, r11, r6;
mad.lo.s32 r8, r5, 1000, r3;
mov.u64 rd5, %51;
mul.wide.u32 rd7, r7, 8;
add.s64 rd6, rd5, rd7;
ld.global.v2.f32 {f619, f620}, [rd6];
mul.f32 f624, f620, f450;
mul.f32 f625, f619, f450;
mul.f32 f627, f620, f620;
mul.f32 f1301, f619, f619;
sub.f32 f628, f1301, f627;
mul.f32 f629, f620, f619;
fma.rn.f32 f630, f620, f619, f629;
mul.f32 f632, f630, f494;
mul.f32 f633, f628, f494;
mul.f32 f1299, f619, f628;
mul.f32 f1300, f620, f630;
sub.f32 f636, f1299, f1300;
mul.f32 f1298, f628, f490;
mul.f32 f637, f619, f630;
fma.rn.f32 f638, f620, f628, f637;
mul.f32 f640, f638, f538;
mul.f32 f641, f636, f538;
mul.f32 f643, f620, f638;
mul.f32 f1297, f619, f636;
sub.f32 f644, f1297, f643;
mul.f32 f1296, f636, f534;
mul.f32 f645, f619, f638;
fma.rn.f32 f646, f620, f636, f645;
mul.f32 f648, f646, f582;
mul.f32 f649, f644, f582;
mul.f32 f1294, f619, f644;
mul.f32 f1295, f620, f646;
sub.f32 f652, f1294, f1295;
mul.f32 f1293, f644, f578;
mul.f32 f653, f619, f646;
fma.rn.f32 f654, f620, f644, f653;
mul.f32 f656, f654, f433;
mul.f32 f657, f652, f433;
mul.f32 f659, f620, f654;
mul.f32 f1292, f619, f652;
sub.f32 f660, f1292, f659;
mul.f32 f1291, f652, f415;
mul.f32 f661, f619, f654;
fma.rn.f32 f662, f620, f652, f661;
mul.f32 f664, f662, f477;
mul.f32 f665, f660, f477;
mul.f32 f667, f620, f662;
mul.f32 f1290, f619, f660;
sub.f32 f668, f1290, f667;
mul.f32 f1289, f660, f459;
mul.f32 f669, f619, f662;
fma.rn.f32 f670, f620, f660, f669;
mul.f32 f672, f670, f521;
mul.f32 f673, f668, f521;
mul.f32 f1287, f619, f668;
mul.f32 f1288, f620, f670;
sub.f32 f676, f1287, f1288;
mul.f32 f1286, f668, f503;
mul.f32 f677, f619, f670;
fma.rn.f32 f678, f620, f668, f677;
mul.f32 f680, f678, f565;
mul.f32 f681, f676, f565;
mul.f32 f683, f620, f678;
mul.f32 f1285, f619, f676;
sub.f32 f684, f1285, f683;
mul.f32 f1284, f676, f547;
mul.f32 f685, f619, f678;
fma.rn.f32 f686, f620, f676, f685;
mul.f32 f688, f686, f609;
mul.f32 f689, f684, f609;
mul.f32 f691, f620, f686;
mul.f32 f1283, f619, f684;
sub.f32 f692, f1283, f691;
mul.f32 f1282, f684, f591;
mul.f32 f693, f619, f686;
fma.rn.f32 f694, f620, f684, f693;
mul.f32 f696, f694, f441;
mul.f32 f697, f692, f441;
mul.f32 f1280, f619, f692;
mul.f32 f1281, f620, f694;
sub.f32 f700, f1280, f1281;
mul.f32 f1279, f692, f423;
mul.f32 f701, f619, f694;
fma.rn.f32 f702, f620, f692, f701;
mul.f32 f704, f702, f485;
mul.f32 f705, f700, f485;
mul.f32 f707, f620, f702;
mul.f32 f1278, f619, f700;
sub.f32 f708, f1278, f707;
mul.f32 f1277, f700, f467;
mul.f32 f709, f619, f702;
fma.rn.f32 f710, f620, f700, f709;
mul.f32 f712, f710, f529;
mul.f32 f713, f708, f529;
mul.f32 f1275, f619, f708;
mul.f32 f1276, f620, f710;
sub.f32 f716, f1275, f1276;
mul.f32 f1274, f708, f511;
mul.f32 f717, f619, f710;
fma.rn.f32 f718, f620, f708, f717;
mul.f32 f720, f718, f573;
mul.f32 f721, f716, f573;
mul.f32 f723, f620, f718;
mul.f32 f1273, f619, f716;
sub.f32 f724, f1273, f723;
mul.f32 f1272, f716, f555;
mul.f32 f725, f619, f718;
fma.rn.f32 f726, f620, f716, f725;
mul.f32 f728, f726, f617;
mul.f32 f729, f724, f617;
mul.f32 f731, f620, f726;
mul.f32 f1271, f619, f724;
sub.f32 f732, f1271, f731;
mul.f32 f1270, f724, f599;
mul.f32 f733, f619, f726;
fma.rn.f32 f734, f620, f724, f733;
mul.f32 f736, f734, f442;
mul.f32 f737, f732, f442;
mul.f32 f1268, f619, f732;
mul.f32 f1269, f620, f734;
sub.f32 f740, f1268, f1269;
mul.f32 f1267, f732, f424;
mul.f32 f741, f619, f734;
fma.rn.f32 f742, f620, f732, f741;
mul.f32 f744, f742, f486;
mul.f32 f745, f740, f486;
mul.f32 f747, f620, f742;
mul.f32 f1266, f619, f740;
sub.f32 f748, f1266, f747;
mul.f32 f1265, f740, f468;
mul.f32 f749, f619, f742;
fma.rn.f32 f750, f620, f740, f749;
mul.f32 f752, f750, f530;
mul.f32 f753, f748, f530;
mul.f32 f755, f620, f750;
mul.f32 f1264, f619, f748;
sub.f32 f756, f1264, f755;
mul.f32 f1263, f748, f512;
mul.f32 f757, f619, f750;
fma.rn.f32 f758, f620, f748, f757;
mul.f32 f760, f758, f574;
mul.f32 f761, f756, f574;
mul.f32 f1261, f619, f756;
mul.f32 f1262, f620, f758;
sub.f32 f764, f1261, f1262;
mul.f32 f1260, f756, f556;
mul.f32 f765, f619, f758;
fma.rn.f32 f766, f620, f756, f765;
mul.f32 f768, f766, f618;
mul.f32 f769, f764, f618;
mul.f32 f771, f620, f766;
mul.f32 f1259, f619, f764;
sub.f32 f772, f1259, f771;
mul.f32 f1258, f764, f600;
mul.f32 f773, f619, f766;
fma.rn.f32 f774, f620, f764, f773;
mul.f32 f776, f774, f434;
mul.f32 f777, f772, f434;
mul.f32 f1256, f619, f772;
mul.f32 f1257, f620, f774;
sub.f32 f780, f1256, f1257;
mul.f32 f1255, f772, f416;
mul.f32 f781, f619, f774;
fma.rn.f32 f782, f620, f772, f781;
mul.f32 f784, f782, f478;
mul.f32 f785, f780, f478;
mul.f32 f787, f620, f782;
mul.f32 f1254, f619, f780;
sub.f32 f788, f1254, f787;
mul.f32 f1253, f780, f460;
mul.f32 f789, f619, f782;
fma.rn.f32 f790, f620, f780, f789;
mul.f32 f792, f790, f522;
mul.f32 f793, f788, f522;
mul.f32 f795, f620, f790;
mul.f32 f1252, f619, f788;
sub.f32 f796, f1252, f795;
mul.f32 f1251, f788, f504;
mul.f32 f797, f619, f790;
fma.rn.f32 f798, f620, f788, f797;
mul.f32 f800, f798, f566;
mul.f32 f801, f796, f566;
mul.f32 f1249, f619, f796;
mul.f32 f1250, f620, f798;
sub.f32 f804, f1249, f1250;
mul.f32 f1248, f619, f446;
mul.f32 f805, f619, f798;
mul.f32 f1247, f796, f548;
fma.rn.f32 f806, f620, f796, f805;
mul.f32 f807, f804, f592;
mul.f32 f808, f806, f610;
mul.f32 f809, f804, f610;
barrier.sync 0;
mad.lo.s32 r9, r7, 200, r8;
add.f32 f810, f1344, f1337;
add.f32 f811, f403, f1342;
st.shared.v2.f32 [r9], {f811, f810};
fma.rn.f32 f812, f620, f446, f625;
sub.f32 f813, f1248, f624;
st.shared.v2.f32 [r9+8], {f813, f812};
fma.rn.f32 f814, f630, f490, f633;
sub.f32 f815, f1298, f632;
st.shared.v2.f32 [r9+16], {f815, f814};
fma.rn.f32 f816, f638, f534, f641;
sub.f32 f817, f1296, f640;
st.shared.v2.f32 [r9+24], {f817, f816};
fma.rn.f32 f818, f646, f578, f649;
sub.f32 f819, f1293, f648;
st.shared.v2.f32 [r9+32], {f819, f818};
sub.f32 f820, f1291, f656;
fma.rn.f32 f821, f654, f415, f657;
st.shared.v2.f32 [r9+40], {f820, f821};
fma.rn.f32 f822, f662, f459, f665;
sub.f32 f823, f1289, f664;
st.shared.v2.f32 [r9+48], {f823, f822};
sub.f32 f824, f1286, f672;
fma.rn.f32 f825, f670, f503, f673;
st.shared.v2.f32 [r9+56], {f824, f825};
fma.rn.f32 f826, f678, f547, f681;
sub.f32 f827, f1284, f680;
st.shared.v2.f32 [r9+64], {f827, f826};
fma.rn.f32 f828, f686, f591, f689;
sub.f32 f829, f1282, f688;
st.shared.v2.f32 [r9+72], {f829, f828};
fma.rn.f32 f830, f694, f423, f697;
sub.f32 f831, f1279, f696;
st.shared.v2.f32 [r9+80], {f831, f830};
fma.rn.f32 f832, f702, f467, f705;
sub.f32 f833, f1277, f704;
st.shared.v2.f32 [r9+88], {f833, f832};
fma.rn.f32 f834, f710, f511, f713;
sub.f32 f835, f1274, f712;
st.shared.v2.f32 [r9+96], {f835, f834};
fma.rn.f32 f836, f718, f555, f721;
sub.f32 f837, f1272, f720;
st.shared.v2.f32 [r9+104], {f837, f836};
fma.rn.f32 f838, f726, f599, f729;
sub.f32 f839, f1270, f728;
st.shared.v2.f32 [r9+112], {f839, f838};
fma.rn.f32 f840, f734, f424, f737;
sub.f32 f841, f1267, f736;
st.shared.v2.f32 [r9+120], {f841, f840};
fma.rn.f32 f842, f742, f468, f745;
sub.f32 f843, f1265, f744;
st.shared.v2.f32 [r9+128], {f843, f842};
fma.rn.f32 f844, f750, f512, f753;
sub.f32 f845, f1263, f752;
st.shared.v2.f32 [r9+136], {f845, f844};
fma.rn.f32 f846, f758, f556, f761;
sub.f32 f847, f1260, f760;
st.shared.v2.f32 [r9+144], {f847, f846};
fma.rn.f32 f848, f766, f600, f769;
sub.f32 f849, f1258, f768;
st.shared.v2.f32 [r9+152], {f849, f848};
fma.rn.f32 f850, f774, f416, f777;
sub.f32 f851, f1255, f776;
st.shared.v2.f32 [r9+160], {f851, f850};
fma.rn.f32 f852, f782, f460, f785;
sub.f32 f853, f1253, f784;
st.shared.v2.f32 [r9+168], {f853, f852};
fma.rn.f32 f854, f790, f504, f793;
sub.f32 f855, f1251, f792;
st.shared.v2.f32 [r9+176], {f855, f854};
fma.rn.f32 f856, f798, f548, f801;
sub.f32 f857, f1247, f800;
st.shared.v2.f32 [r9+184], {f857, f856};
fma.rn.f32 f858, f806, f592, f809;
sub.f32 f859, f807, f808;
st.shared.v2.f32 [r9+192], {f859, f858};
barrier.sync 0;
mad.lo.s32 r10, r7, -192, r9;
ld.shared.v2.f32 {f860, f861}, [r10];
ld.shared.v2.f32 {f864, f865}, [r10+40];
ld.shared.v2.f32 {f868, f869}, [r10+80];
ld.shared.v2.f32 {f872, f873}, [r10+120];
ld.shared.v2.f32 {f876, f877}, [r10+160];
ld.shared.v2.f32 {f880, f881}, [r10+200];
ld.shared.v2.f32 {f884, f885}, [r10+240];
ld.shared.v2.f32 {f888, f889}, [r10+280];
ld.shared.v2.f32 {f892, f893}, [r10+320];
ld.shared.v2.f32 {f896, f897}, [r10+360];
ld.shared.v2.f32 {f900, f901}, [r10+400];
ld.shared.v2.f32 {f904, f905}, [r10+440];
ld.shared.v2.f32 {f908, f909}, [r10+480];
ld.shared.v2.f32 {f912, f913}, [r10+520];
ld.shared.v2.f32 {f916, f917}, [r10+560];
ld.shared.v2.f32 {f920, f921}, [r10+600];
ld.shared.v2.f32 {f924, f925}, [r10+640];
ld.shared.v2.f32 {f928, f929}, [r10+680];
ld.shared.v2.f32 {f932, f933}, [r10+720];
ld.shared.v2.f32 {f936, f937}, [r10+760];
ld.shared.v2.f32 {f940, f941}, [r10+800];
ld.shared.v2.f32 {f944, f945}, [r10+840];
ld.shared.v2.f32 {f948, f949}, [r10+880];
ld.shared.v2.f32 {f952, f953}, [r10+920];
ld.shared.v2.f32 {f956, f957}, [r10+960];
add.f32 f960, f880, f940;
add.f32 f962, f900, f920;
fma.rn.f32 f1245, f960, 0f3E9E377A, f860;
mul.f32 f1246, f962, 0f3F4F1BBD;
sub.f32 f968, f1245, f1246;
add.f32 f1244, f881, f941;
sub.f32 f969, f881, f941;
add.f32 f1243, f901, f921;
sub.f32 f971, f901, f921;
mul.f32 f1241, f969, 0f3F737871;
mul.f32 f1242, f971, 0fBF167918;
sub.f32 f973, f1242, f1241;
add.f32 f1240, f860, f960;
mul.f32 f974, f960, 0f3F4F1BBD;
sub.f32 f975, f860, f974;
fma.rn.f32 f976, f962, 0f3E9E377A, f975;
mul.f32 f977, f969, 0f3F167918;
mul.f32 f978, f971, 0f3F737871;
sub.f32 f979, f978, f977;
fma.rn.f32 f1238, f1244, 0f3E9E377A, f861;
mul.f32 f1239, f1243, 0f3F4F1BBD;
sub.f32 f982, f1238, f1239;
sub.f32 f983, f880, f940;
sub.f32 f985, f900, f920;
mul.f32 f1236, f983, 0f3F737871;
mul.f32 f1237, f985, 0fBF167918;
sub.f32 f987, f1237, f1236;
add.f32 f1235, f861, f1244;
mul.f32 f988, f1244, 0f3F4F1BBD;
sub.f32 f989, f861, f988;
fma.rn.f32 f990, f1243, 0f3E9E377A, f989;
mul.f32 f991, f983, 0f3F167918;
mul.f32 f992, f985, 0f3F737871;
sub.f32 f993, f992, f991;
add.f32 f994, f884, f944;
add.f32 f996, f904, f924;
mul.f32 f1001, f996, 0f3F4F1BBD;
fma.rn.f32 f1234, f994, 0f3E9E377A, f864;
sub.f32 f1002, f1234, f1001;
add.f32 f1233, f885, f945;
sub.f32 f1003, f885, f945;
add.f32 f1232, f905, f925;
sub.f32 f1005, f905, f925;
mul.f32 f1230, f1003, 0f3F737871;
mul.f32 f1231, f1005, 0fBF167918;
sub.f32 f1007, f1231, f1230;
add.f32 f1229, f864, f994;
mul.f32 f1008, f994, 0f3F4F1BBD;
sub.f32 f1009, f864, f1008;
fma.rn.f32 f1010, f996, 0f3E9E377A, f1009;
mul.f32 f1011, f1003, 0f3F167918;
mul.f32 f1012, f1005, 0f3F737871;
sub.f32 f1013, f1012, f1011;
mul.f32 f1015, f1232, 0f3F4F1BBD;
fma.rn.f32 f1228, f1233, 0f3E9E377A, f865;
sub.f32 f1016, f1228, f1015;
sub.f32 f1017, f884, f944;
sub.f32 f1019, f904, f924;
mul.f32 f1226, f1017, 0f3F737871;
mul.f32 f1227, f1019, 0fBF167918;
sub.f32 f1021, f1227, f1226;
add.f32 f1225, f865, f1233;
mul.f32 f1022, f1233, 0f3F4F1BBD;
sub.f32 f1023, f865, f1022;
fma.rn.f32 f1024, f1232, 0f3E9E377A, f1023;
mul.f32 f1025, f1017, 0f3F167918;
mul.f32 f1026, f1019, 0f3F737871;
sub.f32 f1027, f1026, f1025;
add.f32 f1028, f888, f948;
add.f32 f1030, f908, f928;
mul.f32 f1035, f1030, 0f3F4F1BBD;
fma.rn.f32 f1224, f1028, 0f3E9E377A, f868;
sub.f32 f1036, f1224, f1035;
add.f32 f1223, f889, f949;
sub.f32 f1037, f889, f949;
add.f32 f1222, f909, f929;
sub.f32 f1039, f909, f929;
mul.f32 f1220, f1037, 0f3F737871;
mul.f32 f1221, f1039, 0fBF167918;
sub.f32 f1041, f1221, f1220;
add.f32 f1219, f868, f1028;
mul.f32 f1042, f1028, 0f3F4F1BBD;
sub.f32 f1043, f868, f1042;
fma.rn.f32 f1044, f1030, 0f3E9E377A, f1043;
mul.f32 f1045, f1037, 0f3F167918;
mul.f32 f1046, f1039, 0f3F737871;
sub.f32 f1047, f1046, f1045;
mul.f32 f1049, f1222, 0f3F4F1BBD;
fma.rn.f32 f1218, f1223, 0f3E9E377A, f869;
sub.f32 f1050, f1218, f1049;
sub.f32 f1051, f888, f948;
sub.f32 f1053, f908, f928;
mul.f32 f1054, f1053, 0fBF167918;
mul.f32 f1217, f1051, 0f3F737871;
sub.f32 f1055, f1054, f1217;
add.f32 f1216, f869, f1223;
mul.f32 f1056, f1223, 0f3F4F1BBD;
sub.f32 f1057, f869, f1056;
fma.rn.f32 f1058, f1222, 0f3E9E377A, f1057;
mul.f32 f1059, f1051, 0f3F167918;
mul.f32 f1060, f1053, 0f3F737871;
sub.f32 f1061, f1060, f1059;
add.f32 f1062, f892, f952;
add.f32 f1064, f912, f932;
fma.rn.f32 f1214, f1062, 0f3E9E377A, f872;
mul.f32 f1215, f1064, 0f3F4F1BBD;
sub.f32 f1070, f1214, f1215;
add.f32 f1213, f893, f953;
sub.f32 f1071, f893, f953;
add.f32 f1212, f913, f933;
sub.f32 f1073, f913, f933;
mul.f32 f1210, f1071, 0f3F737871;
mul.f32 f1211, f1073, 0fBF167918;
sub.f32 f1075, f1211, f1210;
add.f32 f1209, f872, f1062;
mul.f32 f1076, f1062, 0f3F4F1BBD;
sub.f32 f1077, f872, f1076;
fma.rn.f32 f1078, f1064, 0f3E9E377A, f1077;
mul.f32 f1079, f1071, 0f3F167918;
mul.f32 f1080, f1073, 0f3F737871;
sub.f32 f1081, f1080, f1079;
fma.rn.f32 f1207, f1213, 0f3E9E377A, f873;
mul.f32 f1208, f1212, 0f3F4F1BBD;
sub.f32 f1084, f1207, f1208;
sub.f32 f1085, f892, f952;
sub.f32 f1087, f912, f932;
mul.f32 f1205, f1085, 0f3F737871;
mul.f32 f1206, f1087, 0fBF167918;
sub.f32 f1089, f1206, f1205;
add.f32 f1204, f873, f1213;
mul.f32 f1090, f1213, 0f3F4F1BBD;
sub.f32 f1091, f873, f1090;
fma.rn.f32 f1092, f1212, 0f3E9E377A, f1091;
mul.f32 f1093, f1085, 0f3F167918;
mul.f32 f1094, f1087, 0f3F737871;
sub.f32 f1095, f1094, f1093;
add.f32 f1096, f896, f956;
add.f32 f1098, f916, f936;
mul.f32 f1103, f1098, 0f3F4F1BBD;
fma.rn.f32 f1203, f1096, 0f3E9E377A, f876;
sub.f32 f1104, f1203, f1103;
add.f32 f1202, f897, f957;
sub.f32 f1105, f897, f957;
add.f32 f1201, f917, f937;
sub.f32 f1107, f917, f937;
mul.f32 f1199, f1105, 0f3F737871;
mul.f32 f1200, f1107, 0fBF167918;
sub.f32 f1109, f1200, f1199;
add.f32 f1198, f876, f1096;
mul.f32 f1110, f1096, 0f3F4F1BBD;
sub.f32 f1111, f876, f1110;
fma.rn.f32 f1112, f1098, 0f3E9E377A, f1111;
mul.f32 f1113, f1105, 0f3F167918;
mul.f32 f1114, f1107, 0f3F737871;
sub.f32 f1115, f1114, f1113;
mul.f32 f1117, f1201, 0f3F4F1BBD;
fma.rn.f32 f1197, f1202, 0f3E9E377A, f877;
sub.f32 f1118, f1197, f1117;
sub.f32 f1119, f896, f956;
sub.f32 f1121, f916, f936;
mul.f32 f1195, f1119, 0f3F737871;
mul.f32 f1196, f1121, 0fBF167918;
sub.f32 f1123, f1196, f1195;
add.f32 f1194, f877, f1202;
mul.f32 f1124, f1202, 0f3F4F1BBD;
sub.f32 f1125, f877, f1124;
fma.rn.f32 f1126, f1201, 0f3E9E377A, f1125;
mul.f32 f1127, f1119, 0f3F167918;
mul.f32 f1128, f1121, 0f3F737871;
sub.f32 f1129, f1128, f1127;
add.f32 %1, f1243, f1235;
add.f32 %0, f962, f1240;
add.f32 %3, f1232, f1225;
add.f32 %2, f996, f1229;
add.f32 %5, f1222, f1216;
add.f32 %4, f1030, f1219;
add.f32 %7, f1212, f1204;
add.f32 %6, f1064, f1209;
add.f32 %9, f1201, f1194;
add.f32 %8, f1098, f1198;
sub.f32 %10, f968, f973;
add.f32 %11, f987, f982;
sub.f32 %12, f1002, f1007;
add.f32 %13, f1021, f1016;
add.f32 %15, f1055, f1050;
sub.f32 %14, f1036, f1041;
add.f32 %17, f1089, f1084;
sub.f32 %16, f1070, f1075;
add.f32 %19, f1123, f1118;
sub.f32 %18, f1104, f1109;
sub.f32 %20, f976, f979;
add.f32 %21, f993, f990;
sub.f32 %22, f1010, f1013;
add.f32 %23, f1027, f1024;
sub.f32 %24, f1044, f1047;
add.f32 %25, f1061, f1058;
sub.f32 %26, f1078, f1081;
add.f32 %27, f1095, f1092;
add.f32 %29, f1129, f1126;
sub.f32 %28, f1112, f1115;
sub.f32 %31, f990, f993;
add.f32 %30, f979, f976;
sub.f32 %33, f1024, f1027;
add.f32 %32, f1013, f1010;
sub.f32 %35, f1058, f1061;
add.f32 %34, f1047, f1044;
sub.f32 %37, f1092, f1095;
add.f32 %36, f1081, f1078;
sub.f32 %39, f1126, f1129;
add.f32 %38, f1115, f1112;
sub.f32 %41, f982, f987;
add.f32 %40, f973, f968;
sub.f32 %43, f1016, f1021;
add.f32 %42, f1007, f1002;
sub.f32 %45, f1050, f1055;
add.f32 %44, f1041, f1036;
sub.f32 %47, f1084, f1089;
add.f32 %46, f1075, f1070;
sub.f32 %49, f1118, f1123;
add.f32 %48, f1109, f1104;
})"
     // Operand numbering used by the PTX above:
     //   %0..%49     outputs: rmem[i].x is %(2i), rmem[i].y is %(2i+1), i = 0..24
     //   %50         smem
     //   %51         lut_sp_25_125
     //   %52..%101   inputs: rmem[i].x is %(52+2i), rmem[i].y is %(53+2i), i = 0..24
     //   %102..%117  repeated .y inputs, in order, for
     //               i = 5, 10, 20, 16, 1, 11, 2, 22, 7, 17, 23, 8, 13, 19, 4, 14
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[16].y), "f"(rmem[1].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[17].y), "f"(rmem[23].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[4].y), 
"f"(rmem[14].y));
};




// Specialization <158, float, 1> of cufftdx_private_function.
//
// A single inline-PTX statement that transforms, in place, the 25 complex
// single-precision values a thread holds in rmem[0..24], exchanging data with
// other threads through shared memory. Presumably this is one stage of the
// 125-point forward FP32 FFT named by the include guard of this header
// (TODO confirm against the dispatch table that selects id 158).
//
// Parameters:
//   rmem - 25 complex<float> values. Every .x/.y is passed as an input
//          operand and every .x/.y is overwritten as an output operand.
//   smem - 32-bit value used directly as the base address of st.shared /
//          ld.shared accesses.
//
// Also reads lut_sp_25_125 (declared outside this block) with ld.global.
//
// Order of work inside the PTX body:
//   1. Five 5-point butterflies over rmem elements {g, g+5, g+10, g+15, g+20}
//      for g = 0..4 (real and imaginary parts handled separately).
//   2. Sixteen of those results are multiplied by hard-coded complex
//      constants, then five more 5-point butterflies combine the five groups,
//      giving 25 complex values.
//   3. tid.x is split into q = tid.x / 5 (multiply by 0xCCCCCCCD, shift right
//      34) and m = tid.x % 5. One complex value w is loaded from
//      lut_sp_25_125 at byte offset 8 * m; values 1..24 are multiplied by
//      w^1..w^24, the powers being built by repeated multiplication by w.
//   4. Exchange through shared memory. The region base is
//      smem + 500 * tid.y + 500 * q. Each thread stores 25 consecutive floats
//      at base + 100 * m, then loads 25 floats at base + 4 * m with a stride
//      of 20 bytes. This is done once for the real parts and once for the
//      imaginary parts, with barrier.sync 0 between every store/load phase.
//   5. Five final 5-point butterflies on the loaded values write the outputs
//      %0..%49.
//
// NOTE(review): tid.y and q both advance the base by 500 bytes, so the regions
// only stay disjoint if at most one of them is nonzero within a block;
// presumably the launch configuration guarantees this - TODO confirm.
// NOTE(review): the first barrier.sync 0 precedes the first shared-memory
// store; presumably it protects an earlier use of the same buffer by the
// caller - TODO confirm.
// NOTE(review): the statement stores to shared memory and executes
// barrier.sync but declares no memory clobber; confirm that callers do not
// depend on the compiler ordering their own shared-memory accesses around it.
// All threads of the block must reach this call, since it contains
// barrier.sync 0.
template<> __forceinline__ __device__ void cufftdx_private_function<158, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1130>;
.reg .b32 r<11>;
.reg .b64 rd<7>;
mov.u32 r1, %tid.y;
mov.u32 r2, %50;
mad.lo.s32 r3, r1, 500, r2;
add.f32 f101, %65, %105;
add.f32 f102, %52, f101;
add.f32 f103, %78, %92;
add.f32 f104, f103, f102;
add.f32 f105, %67, %107;
add.f32 f106, %53, f105;
add.f32 f107, %80, %93;
add.f32 f108, f107, f106;
fma.rn.f32 f109, f101, 0f3E9E377A, %52;
mul.f32 f110, f103, 0f3F4F1BBD;
sub.f32 f111, f109, f110;
sub.f32 f112, %67, %107;
mul.f32 f113, f112, 0f3F737871;
sub.f32 f114, %80, %93;
mul.f32 f115, f114, 0fBF167918;
sub.f32 f116, f115, f113;
sub.f32 f117, f111, f116;
add.f32 f118, f116, f111;
mul.f32 f119, f101, 0f3F4F1BBD;
sub.f32 f120, %52, f119;
fma.rn.f32 f121, f103, 0f3E9E377A, f120;
mul.f32 f122, f112, 0f3F167918;
mul.f32 f123, f114, 0f3F737871;
sub.f32 f124, f123, f122;
sub.f32 f125, f121, f124;
add.f32 f126, f124, f121;
fma.rn.f32 f127, f105, 0f3E9E377A, %53;
mul.f32 f128, f107, 0f3F4F1BBD;
sub.f32 f129, f127, f128;
sub.f32 f130, %65, %105;
mul.f32 f131, f130, 0f3F737871;
sub.f32 f132, %78, %92;
mul.f32 f133, f132, 0fBF167918;
sub.f32 f134, f133, f131;
add.f32 f135, f134, f129;
sub.f32 f136, f129, f134;
mul.f32 f137, f105, 0f3F4F1BBD;
sub.f32 f138, %53, f137;
fma.rn.f32 f139, f107, 0f3E9E377A, f138;
mul.f32 f140, f130, 0f3F167918;
mul.f32 f141, f132, 0f3F737871;
sub.f32 f142, f141, f140;
add.f32 f143, f142, f139;
sub.f32 f144, f139, f142;
add.f32 f145, %68, %108;
add.f32 f146, %54, f145;
add.f32 f147, %81, %94;
add.f32 f148, f147, f146;
add.f32 f149, %69, %109;
add.f32 f150, %56, f149;
add.f32 f151, %83, %96;
add.f32 f152, f151, f150;
fma.rn.f32 f153, f145, 0f3E9E377A, %54;
mul.f32 f154, f147, 0f3F4F1BBD;
sub.f32 f155, f153, f154;
sub.f32 f156, %69, %109;
mul.f32 f157, f156, 0f3F737871;
sub.f32 f158, %83, %96;
mul.f32 f159, f158, 0fBF167918;
sub.f32 f160, f159, f157;
sub.f32 f161, f155, f160;
add.f32 f162, f160, f155;
mul.f32 f163, f145, 0f3F4F1BBD;
sub.f32 f164, %54, f163;
fma.rn.f32 f165, f147, 0f3E9E377A, f164;
mul.f32 f166, f156, 0f3F167918;
mul.f32 f167, f158, 0f3F737871;
sub.f32 f168, f167, f166;
sub.f32 f169, f165, f168;
add.f32 f170, f168, f165;
fma.rn.f32 f171, f149, 0f3E9E377A, %56;
mul.f32 f172, f151, 0f3F4F1BBD;
sub.f32 f173, f171, f172;
sub.f32 f174, %68, %108;
mul.f32 f175, f174, 0f3F737871;
sub.f32 f176, %81, %94;
mul.f32 f177, f176, 0fBF167918;
sub.f32 f178, f177, f175;
add.f32 f179, f178, f173;
sub.f32 f180, f173, f178;
mul.f32 f181, f149, 0f3F4F1BBD;
sub.f32 f182, %56, f181;
fma.rn.f32 f183, f151, 0f3E9E377A, f182;
mul.f32 f184, f174, 0f3F167918;
mul.f32 f185, f176, 0f3F737871;
sub.f32 f186, f185, f184;
add.f32 f187, f186, f183;
sub.f32 f188, f183, f186;
add.f32 f189, %70, %110;
add.f32 f190, %57, f189;
add.f32 f191, %84, %97;
add.f32 f192, f191, f190;
add.f32 f193, %72, %112;
add.f32 f194, %59, f193;
add.f32 f195, %85, %99;
add.f32 f196, f195, f194;
fma.rn.f32 f197, f189, 0f3E9E377A, %57;
mul.f32 f198, f191, 0f3F4F1BBD;
sub.f32 f199, f197, f198;
sub.f32 f200, %72, %112;
mul.f32 f201, f200, 0f3F737871;
sub.f32 f202, %85, %99;
mul.f32 f203, f202, 0fBF167918;
sub.f32 f204, f203, f201;
sub.f32 f205, f199, f204;
add.f32 f206, f204, f199;
mul.f32 f207, f189, 0f3F4F1BBD;
sub.f32 f208, %57, f207;
fma.rn.f32 f209, f191, 0f3E9E377A, f208;
mul.f32 f210, f200, 0f3F167918;
mul.f32 f211, f202, 0f3F737871;
sub.f32 f212, f211, f210;
sub.f32 f213, f209, f212;
add.f32 f214, f212, f209;
fma.rn.f32 f215, f193, 0f3E9E377A, %59;
mul.f32 f216, f195, 0f3F4F1BBD;
sub.f32 f217, f215, f216;
sub.f32 f218, %70, %110;
mul.f32 f219, f218, 0f3F737871;
sub.f32 f220, %84, %97;
mul.f32 f221, f220, 0fBF167918;
sub.f32 f222, f221, f219;
add.f32 f223, f222, f217;
sub.f32 f224, f217, f222;
mul.f32 f225, f193, 0f3F4F1BBD;
sub.f32 f226, %59, f225;
fma.rn.f32 f227, f195, 0f3E9E377A, f226;
mul.f32 f228, f218, 0f3F167918;
mul.f32 f229, f220, 0f3F737871;
sub.f32 f230, f229, f228;
add.f32 f231, f230, f227;
sub.f32 f232, f227, f230;
add.f32 f233, %73, %113;
add.f32 f234, %60, f233;
add.f32 f235, %86, %100;
add.f32 f236, f235, f234;
add.f32 f237, %75, %115;
add.f32 f238, %61, f237;
add.f32 f239, %88, %101;
add.f32 f240, f239, f238;
fma.rn.f32 f241, f233, 0f3E9E377A, %60;
mul.f32 f242, f235, 0f3F4F1BBD;
sub.f32 f243, f241, f242;
sub.f32 f244, %75, %115;
mul.f32 f245, f244, 0f3F737871;
sub.f32 f246, %88, %101;
mul.f32 f247, f246, 0fBF167918;
sub.f32 f248, f247, f245;
sub.f32 f249, f243, f248;
add.f32 f250, f248, f243;
mul.f32 f251, f233, 0f3F4F1BBD;
sub.f32 f252, %60, f251;
fma.rn.f32 f253, f235, 0f3E9E377A, f252;
mul.f32 f254, f244, 0f3F167918;
mul.f32 f255, f246, 0f3F737871;
sub.f32 f256, f255, f254;
sub.f32 f257, f253, f256;
add.f32 f258, f256, f253;
fma.rn.f32 f259, f237, 0f3E9E377A, %61;
mul.f32 f260, f239, 0f3F4F1BBD;
sub.f32 f261, f259, f260;
sub.f32 f262, %73, %113;
mul.f32 f263, f262, 0f3F737871;
sub.f32 f264, %86, %100;
mul.f32 f265, f264, 0fBF167918;
sub.f32 f266, f265, f263;
add.f32 f267, f266, f261;
sub.f32 f268, f261, f266;
mul.f32 f269, f237, 0f3F4F1BBD;
sub.f32 f270, %61, f269;
fma.rn.f32 f271, f239, 0f3E9E377A, f270;
mul.f32 f272, f262, 0f3F167918;
mul.f32 f273, f264, 0f3F737871;
sub.f32 f274, f273, f272;
add.f32 f275, f274, f271;
sub.f32 f276, f271, f274;
add.f32 f277, %76, %116;
add.f32 f278, %62, f277;
add.f32 f279, %89, %102;
add.f32 f280, f279, f278;
add.f32 f281, %77, %117;
add.f32 f282, %64, f281;
add.f32 f283, %91, %104;
add.f32 f284, f283, f282;
fma.rn.f32 f285, f277, 0f3E9E377A, %62;
mul.f32 f286, f279, 0f3F4F1BBD;
sub.f32 f287, f285, f286;
sub.f32 f288, %77, %117;
mul.f32 f289, f288, 0f3F737871;
sub.f32 f290, %91, %104;
mul.f32 f291, f290, 0fBF167918;
sub.f32 f292, f291, f289;
sub.f32 f293, f287, f292;
add.f32 f294, f292, f287;
mul.f32 f295, f277, 0f3F4F1BBD;
sub.f32 f296, %62, f295;
fma.rn.f32 f297, f279, 0f3E9E377A, f296;
mul.f32 f298, f288, 0f3F167918;
mul.f32 f299, f290, 0f3F737871;
sub.f32 f300, f299, f298;
sub.f32 f301, f297, f300;
add.f32 f302, f300, f297;
fma.rn.f32 f303, f281, 0f3E9E377A, %64;
mul.f32 f304, f283, 0f3F4F1BBD;
sub.f32 f305, f303, f304;
sub.f32 f306, %76, %116;
mul.f32 f307, f306, 0f3F737871;
sub.f32 f308, %89, %102;
mul.f32 f309, f308, 0fBF167918;
sub.f32 f310, f309, f307;
add.f32 f311, f310, f305;
sub.f32 f312, f305, f310;
mul.f32 f313, f281, 0f3F4F1BBD;
sub.f32 f314, %64, f313;
fma.rn.f32 f315, f283, 0f3E9E377A, f314;
mul.f32 f316, f306, 0f3F167918;
mul.f32 f317, f308, 0f3F737871;
sub.f32 f318, f317, f316;
add.f32 f319, f318, f315;
sub.f32 f320, f315, f318;
mov.u32 r4, %tid.x;
mul.f32 f321, f161, 0f3F77F511;
mul.f32 f322, f179, 0fBE7EA890;
sub.f32 f323, f321, f322;
mul.f32 f324, f179, 0f3F77F511;
fma.rn.f32 f325, f161, 0fBE7EA890, f324;
mul.f32 f326, f205, 0f3F6055A2;
mul.f32 f327, f223, 0fBEF6A86B;
sub.f32 f328, f326, f327;
mul.f32 f329, f223, 0f3F6055A2;
fma.rn.f32 f330, f205, 0fBEF6A86B, f329;
mul.f32 f331, f249, 0f3F3A9DB0;
mul.f32 f332, f267, 0fBF2F3E7B;
sub.f32 f333, f331, f332;
mul.f32 f334, f267, 0f3F3A9DB0;
fma.rn.f32 f335, f249, 0fBF2F3E7B, f334;
mul.f32 f336, f293, 0f3F092BF2;
mul.f32 f337, f311, 0fBF5825E0;
sub.f32 f338, f336, f337;
mul.f32 f339, f311, 0f3F092BF2;
fma.rn.f32 f340, f293, 0fBF5825E0, f339;
mul.f32 f341, f169, 0f3F6055A2;
mul.f32 f342, f187, 0fBEF6A86B;
sub.f32 f343, f341, f342;
mul.f32 f344, f187, 0f3F6055A2;
fma.rn.f32 f345, f169, 0fBEF6A86B, f344;
mul.f32 f346, f213, 0f3F092BF2;
mul.f32 f347, f231, 0fBF5825E0;
sub.f32 f348, f346, f347;
mul.f32 f349, f231, 0f3F092BF2;
fma.rn.f32 f350, f213, 0fBF5825E0, f349;
mul.f32 f351, f257, 0f3D809851;
mul.f32 f352, f275, 0fBF7F7EAE;
sub.f32 f353, f351, f352;
mul.f32 f354, f275, 0f3D809851;
fma.rn.f32 f355, f257, 0fBF7F7EAE, f354;
mul.f32 f356, f301, 0fBED9FFBE;
mul.f32 f357, f319, 0fBF67A2BF;
sub.f32 f358, f356, f357;
mul.f32 f359, f319, 0fBED9FFBE;
fma.rn.f32 f360, f301, 0fBF67A2BF, f359;
mul.f32 f361, f170, 0f3F3A9DB0;
mul.f32 f362, f188, 0fBF2F3E7B;
sub.f32 f363, f361, f362;
mul.f32 f364, f188, 0f3F3A9DB0;
fma.rn.f32 f365, f170, 0fBF2F3E7B, f364;
mul.f32 f366, f214, 0f3D809851;
mul.f32 f367, f232, 0fBF7F7EAE;
sub.f32 f368, f366, f367;
mul.f32 f369, f232, 0f3D809851;
fma.rn.f32 f370, f214, 0fBF7F7EAE, f369;
mul.f32 f371, f258, 0fBF232E38;
mul.f32 f372, f276, 0fBF45405B;
sub.f32 f373, f371, f372;
mul.f32 f374, f276, 0fBF232E38;
fma.rn.f32 f375, f258, 0fBF45405B, f374;
mul.f32 f376, f302, 0fBF7DFB3B;
mul.f32 f377, f320, 0fBE00575B;
sub.f32 f378, f376, f377;
mul.f32 f379, f320, 0fBF7DFB3B;
fma.rn.f32 f380, f302, 0fBE00575B, f379;
mul.f32 f381, f162, 0f3F092BF2;
mul.f32 f382, f180, 0fBF5825E0;
sub.f32 f383, f381, f382;
mul.f32 f384, f180, 0f3F092BF2;
fma.rn.f32 f385, f162, 0fBF5825E0, f384;
mul.f32 f386, f206, 0fBED9FFBE;
mul.f32 f387, f224, 0fBF67A2BF;
sub.f32 f388, f386, f387;
mul.f32 f389, f224, 0fBED9FFBE;
fma.rn.f32 f390, f206, 0fBF67A2BF, f389;
mul.f32 f391, f250, 0fBF7DFB3B;
mul.f32 f392, f268, 0fBE00575B;
sub.f32 f393, f391, f392;
mul.f32 f394, f268, 0fBF7DFB3B;
fma.rn.f32 f395, f250, 0fBE00575B, f394;
mul.f32 f396, f294, 0fBF232E38;
mul.f32 f397, f312, 0f3F45405B;
sub.f32 f398, f396, f397;
mul.f32 f399, f312, 0fBF232E38;
fma.rn.f32 f400, f294, 0f3F45405B, f399;
add.f32 f401, f148, f280;
add.f32 f402, f104, f401;
add.f32 f403, f192, f236;
add.f32 f404, f403, f402;
add.f32 f405, f152, f284;
add.f32 f406, f108, f405;
add.f32 f407, f196, f240;
add.f32 f408, f407, f406;
fma.rn.f32 f409, f401, 0f3E9E377A, f104;
mul.f32 f410, f403, 0f3F4F1BBD;
sub.f32 f411, f409, f410;
sub.f32 f412, f152, f284;
mul.f32 f413, f412, 0f3F737871;
sub.f32 f414, f196, f240;
mul.f32 f415, f414, 0fBF167918;
sub.f32 f416, f415, f413;
sub.f32 f417, f411, f416;
add.f32 f418, f416, f411;
mul.f32 f419, f401, 0f3F4F1BBD;
sub.f32 f420, f104, f419;
fma.rn.f32 f421, f403, 0f3E9E377A, f420;
mul.f32 f422, f412, 0f3F167918;
mul.f32 f423, f414, 0f3F737871;
sub.f32 f424, f423, f422;
sub.f32 f425, f421, f424;
add.f32 f426, f424, f421;
fma.rn.f32 f427, f405, 0f3E9E377A, f108;
mul.f32 f428, f407, 0f3F4F1BBD;
sub.f32 f429, f427, f428;
sub.f32 f430, f148, f280;
mul.f32 f431, f430, 0f3F737871;
sub.f32 f432, f192, f236;
mul.f32 f433, f432, 0fBF167918;
sub.f32 f434, f433, f431;
add.f32 f435, f434, f429;
sub.f32 f436, f429, f434;
mul.f32 f437, f405, 0f3F4F1BBD;
sub.f32 f438, f108, f437;
fma.rn.f32 f439, f407, 0f3E9E377A, f438;
mul.f32 f440, f430, 0f3F167918;
mul.f32 f441, f432, 0f3F737871;
sub.f32 f442, f441, f440;
add.f32 f443, f442, f439;
sub.f32 f444, f439, f442;
add.f32 f445, f323, f338;
add.f32 f446, f117, f445;
add.f32 f447, f328, f333;
add.f32 f448, f447, f446;
add.f32 f449, f325, f340;
add.f32 f450, f135, f449;
add.f32 f451, f330, f335;
add.f32 f452, f451, f450;
fma.rn.f32 f453, f445, 0f3E9E377A, f117;
mul.f32 f454, f447, 0f3F4F1BBD;
sub.f32 f455, f453, f454;
sub.f32 f456, f325, f340;
mul.f32 f457, f456, 0f3F737871;
sub.f32 f458, f330, f335;
mul.f32 f459, f458, 0fBF167918;
sub.f32 f460, f459, f457;
sub.f32 f461, f455, f460;
add.f32 f462, f460, f455;
mul.f32 f463, f445, 0f3F4F1BBD;
sub.f32 f464, f117, f463;
fma.rn.f32 f465, f447, 0f3E9E377A, f464;
mul.f32 f466, f456, 0f3F167918;
mul.f32 f467, f458, 0f3F737871;
sub.f32 f468, f467, f466;
sub.f32 f469, f465, f468;
add.f32 f470, f468, f465;
fma.rn.f32 f471, f449, 0f3E9E377A, f135;
mul.f32 f472, f451, 0f3F4F1BBD;
sub.f32 f473, f471, f472;
sub.f32 f474, f323, f338;
mul.f32 f475, f474, 0f3F737871;
sub.f32 f476, f328, f333;
mul.f32 f477, f476, 0fBF167918;
sub.f32 f478, f477, f475;
add.f32 f479, f478, f473;
sub.f32 f480, f473, f478;
mul.f32 f481, f449, 0f3F4F1BBD;
sub.f32 f482, f135, f481;
fma.rn.f32 f483, f451, 0f3E9E377A, f482;
mul.f32 f484, f474, 0f3F167918;
mul.f32 f485, f476, 0f3F737871;
sub.f32 f486, f485, f484;
add.f32 f487, f486, f483;
sub.f32 f488, f483, f486;
add.f32 f489, f343, f358;
add.f32 f490, f125, f489;
add.f32 f491, f348, f353;
add.f32 f492, f491, f490;
add.f32 f493, f345, f360;
add.f32 f494, f143, f493;
add.f32 f495, f350, f355;
add.f32 f496, f495, f494;
fma.rn.f32 f497, f489, 0f3E9E377A, f125;
mul.f32 f498, f491, 0f3F4F1BBD;
sub.f32 f499, f497, f498;
sub.f32 f500, f345, f360;
mul.f32 f501, f500, 0f3F737871;
sub.f32 f502, f350, f355;
mul.f32 f503, f502, 0fBF167918;
sub.f32 f504, f503, f501;
sub.f32 f505, f499, f504;
add.f32 f506, f504, f499;
mul.f32 f507, f489, 0f3F4F1BBD;
sub.f32 f508, f125, f507;
fma.rn.f32 f509, f491, 0f3E9E377A, f508;
mul.f32 f510, f500, 0f3F167918;
mul.f32 f511, f502, 0f3F737871;
sub.f32 f512, f511, f510;
sub.f32 f513, f509, f512;
add.f32 f514, f512, f509;
fma.rn.f32 f515, f493, 0f3E9E377A, f143;
mul.f32 f516, f495, 0f3F4F1BBD;
sub.f32 f517, f515, f516;
sub.f32 f518, f343, f358;
mul.f32 f519, f518, 0f3F737871;
sub.f32 f520, f348, f353;
mul.f32 f521, f520, 0fBF167918;
sub.f32 f522, f521, f519;
add.f32 f523, f522, f517;
sub.f32 f524, f517, f522;
mul.f32 f525, f493, 0f3F4F1BBD;
sub.f32 f526, f143, f525;
fma.rn.f32 f527, f495, 0f3E9E377A, f526;
mul.f32 f528, f518, 0f3F167918;
mul.f32 f529, f520, 0f3F737871;
sub.f32 f530, f529, f528;
add.f32 f531, f530, f527;
sub.f32 f532, f527, f530;
add.f32 f533, f363, f378;
add.f32 f534, f126, f533;
add.f32 f535, f368, f373;
add.f32 f536, f535, f534;
add.f32 f537, f365, f380;
add.f32 f538, f144, f537;
add.f32 f539, f370, f375;
add.f32 f540, f539, f538;
fma.rn.f32 f541, f533, 0f3E9E377A, f126;
mul.f32 f542, f535, 0f3F4F1BBD;
sub.f32 f543, f541, f542;
sub.f32 f544, f365, f380;
mul.f32 f545, f544, 0f3F737871;
sub.f32 f546, f370, f375;
mul.f32 f547, f546, 0fBF167918;
sub.f32 f548, f547, f545;
sub.f32 f549, f543, f548;
add.f32 f550, f548, f543;
mul.f32 f551, f533, 0f3F4F1BBD;
sub.f32 f552, f126, f551;
fma.rn.f32 f553, f535, 0f3E9E377A, f552;
mul.f32 f554, f544, 0f3F167918;
mul.f32 f555, f546, 0f3F737871;
sub.f32 f556, f555, f554;
sub.f32 f557, f553, f556;
add.f32 f558, f556, f553;
fma.rn.f32 f559, f537, 0f3E9E377A, f144;
mul.f32 f560, f539, 0f3F4F1BBD;
sub.f32 f561, f559, f560;
sub.f32 f562, f363, f378;
mul.f32 f563, f562, 0f3F737871;
sub.f32 f564, f368, f373;
mul.f32 f565, f564, 0fBF167918;
sub.f32 f566, f565, f563;
add.f32 f567, f566, f561;
sub.f32 f568, f561, f566;
mul.f32 f569, f537, 0f3F4F1BBD;
sub.f32 f570, f144, f569;
fma.rn.f32 f571, f539, 0f3E9E377A, f570;
mul.f32 f572, f562, 0f3F167918;
mul.f32 f573, f564, 0f3F737871;
sub.f32 f574, f573, f572;
add.f32 f575, f574, f571;
sub.f32 f576, f571, f574;
add.f32 f577, f383, f398;
add.f32 f578, f118, f577;
add.f32 f579, f388, f393;
add.f32 f580, f579, f578;
add.f32 f581, f385, f400;
add.f32 f582, f136, f581;
add.f32 f583, f390, f395;
add.f32 f584, f583, f582;
fma.rn.f32 f585, f577, 0f3E9E377A, f118;
mul.f32 f586, f579, 0f3F4F1BBD;
sub.f32 f587, f585, f586;
sub.f32 f588, f385, f400;
mul.f32 f589, f588, 0f3F737871;
sub.f32 f590, f390, f395;
mul.f32 f591, f590, 0fBF167918;
sub.f32 f592, f591, f589;
sub.f32 f593, f587, f592;
add.f32 f594, f592, f587;
mul.f32 f595, f577, 0f3F4F1BBD;
sub.f32 f596, f118, f595;
fma.rn.f32 f597, f579, 0f3E9E377A, f596;
mul.f32 f598, f588, 0f3F167918;
mul.f32 f599, f590, 0f3F737871;
sub.f32 f600, f599, f598;
sub.f32 f601, f597, f600;
add.f32 f602, f600, f597;
fma.rn.f32 f603, f581, 0f3E9E377A, f136;
mul.f32 f604, f583, 0f3F4F1BBD;
sub.f32 f605, f603, f604;
sub.f32 f606, f383, f398;
mul.f32 f607, f606, 0f3F737871;
sub.f32 f608, f388, f393;
mul.f32 f609, f608, 0fBF167918;
sub.f32 f610, f609, f607;
add.f32 f611, f610, f605;
sub.f32 f612, f605, f610;
mul.f32 f613, f581, 0f3F4F1BBD;
sub.f32 f614, f136, f613;
fma.rn.f32 f615, f583, 0f3E9E377A, f614;
mul.f32 f616, f606, 0f3F167918;
mul.f32 f617, f608, 0f3F737871;
sub.f32 f618, f617, f616;
add.f32 f619, f618, f615;
sub.f32 f620, f615, f618;
mul.wide.u32 rd2, r4, -858993459;
shr.u64 rd3, rd2, 34;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 5;
sub.s32 r7, r4, r6;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %51;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f621, f622}, [rd6];
mul.f32 f625, f621, f448;
mul.f32 f626, f622, f452;
sub.f32 f627, f625, f626;
mul.f32 f628, f621, f452;
fma.rn.f32 f629, f622, f448, f628;
mul.f32 f630, f621, f621;
mul.f32 f631, f622, f622;
sub.f32 f632, f630, f631;
mul.f32 f633, f622, f621;
fma.rn.f32 f634, f622, f621, f633;
mul.f32 f635, f632, f492;
mul.f32 f636, f634, f496;
sub.f32 f637, f635, f636;
mul.f32 f638, f632, f496;
fma.rn.f32 f639, f634, f492, f638;
mul.f32 f640, f621, f632;
mul.f32 f641, f622, f634;
sub.f32 f642, f640, f641;
mul.f32 f643, f621, f634;
fma.rn.f32 f644, f622, f632, f643;
mul.f32 f645, f642, f536;
mul.f32 f646, f644, f540;
sub.f32 f647, f645, f646;
mul.f32 f648, f642, f540;
fma.rn.f32 f649, f644, f536, f648;
mul.f32 f650, f621, f642;
mul.f32 f651, f622, f644;
sub.f32 f652, f650, f651;
mul.f32 f653, f621, f644;
fma.rn.f32 f654, f622, f642, f653;
mul.f32 f655, f652, f580;
mul.f32 f656, f654, f584;
sub.f32 f657, f655, f656;
mul.f32 f658, f652, f584;
fma.rn.f32 f659, f654, f580, f658;
mul.f32 f660, f621, f652;
mul.f32 f661, f622, f654;
sub.f32 f662, f660, f661;
mul.f32 f663, f621, f654;
fma.rn.f32 f664, f622, f652, f663;
mul.f32 f665, f662, f417;
mul.f32 f666, f664, f435;
sub.f32 f667, f665, f666;
mul.f32 f668, f662, f435;
fma.rn.f32 f669, f664, f417, f668;
mul.f32 f670, f621, f662;
mul.f32 f671, f622, f664;
sub.f32 f672, f670, f671;
mul.f32 f673, f621, f664;
fma.rn.f32 f674, f622, f662, f673;
mul.f32 f675, f672, f461;
mul.f32 f676, f674, f479;
sub.f32 f677, f675, f676;
mul.f32 f678, f672, f479;
fma.rn.f32 f679, f674, f461, f678;
mul.f32 f680, f621, f672;
mul.f32 f681, f622, f674;
sub.f32 f682, f680, f681;
mul.f32 f683, f621, f674;
fma.rn.f32 f684, f622, f672, f683;
mul.f32 f685, f682, f505;
mul.f32 f686, f684, f523;
sub.f32 f687, f685, f686;
mul.f32 f688, f682, f523;
fma.rn.f32 f689, f684, f505, f688;
mul.f32 f690, f621, f682;
mul.f32 f691, f622, f684;
sub.f32 f692, f690, f691;
mul.f32 f693, f621, f684;
fma.rn.f32 f694, f622, f682, f693;
mul.f32 f695, f692, f549;
mul.f32 f696, f694, f567;
sub.f32 f697, f695, f696;
mul.f32 f698, f692, f567;
fma.rn.f32 f699, f694, f549, f698;
mul.f32 f700, f621, f692;
mul.f32 f701, f622, f694;
sub.f32 f702, f700, f701;
mul.f32 f703, f621, f694;
fma.rn.f32 f704, f622, f692, f703;
mul.f32 f705, f702, f593;
mul.f32 f706, f704, f611;
sub.f32 f707, f705, f706;
mul.f32 f708, f702, f611;
fma.rn.f32 f709, f704, f593, f708;
mul.f32 f710, f621, f702;
mul.f32 f711, f622, f704;
sub.f32 f712, f710, f711;
mul.f32 f713, f621, f704;
fma.rn.f32 f714, f622, f702, f713;
mul.f32 f715, f712, f425;
mul.f32 f716, f714, f443;
sub.f32 f717, f715, f716;
mul.f32 f718, f712, f443;
fma.rn.f32 f719, f714, f425, f718;
mul.f32 f720, f621, f712;
mul.f32 f721, f622, f714;
sub.f32 f722, f720, f721;
mul.f32 f723, f621, f714;
fma.rn.f32 f724, f622, f712, f723;
mul.f32 f725, f722, f469;
mul.f32 f726, f724, f487;
sub.f32 f727, f725, f726;
mul.f32 f728, f722, f487;
fma.rn.f32 f729, f724, f469, f728;
mul.f32 f730, f621, f722;
mul.f32 f731, f622, f724;
sub.f32 f732, f730, f731;
mul.f32 f733, f621, f724;
fma.rn.f32 f734, f622, f722, f733;
mul.f32 f735, f732, f513;
mul.f32 f736, f734, f531;
sub.f32 f737, f735, f736;
mul.f32 f738, f732, f531;
fma.rn.f32 f739, f734, f513, f738;
mul.f32 f740, f621, f732;
mul.f32 f741, f622, f734;
sub.f32 f742, f740, f741;
mul.f32 f743, f621, f734;
fma.rn.f32 f744, f622, f732, f743;
mul.f32 f745, f742, f557;
mul.f32 f746, f744, f575;
sub.f32 f747, f745, f746;
mul.f32 f748, f742, f575;
fma.rn.f32 f749, f744, f557, f748;
mul.f32 f750, f621, f742;
mul.f32 f751, f622, f744;
sub.f32 f752, f750, f751;
mul.f32 f753, f621, f744;
fma.rn.f32 f754, f622, f742, f753;
mul.f32 f755, f752, f601;
mul.f32 f756, f754, f619;
sub.f32 f757, f755, f756;
mul.f32 f758, f752, f619;
fma.rn.f32 f759, f754, f601, f758;
mul.f32 f760, f621, f752;
mul.f32 f761, f622, f754;
sub.f32 f762, f760, f761;
mul.f32 f763, f621, f754;
fma.rn.f32 f764, f622, f752, f763;
mul.f32 f765, f762, f426;
mul.f32 f766, f764, f444;
sub.f32 f767, f765, f766;
mul.f32 f768, f762, f444;
fma.rn.f32 f769, f764, f426, f768;
mul.f32 f770, f621, f762;
mul.f32 f771, f622, f764;
sub.f32 f772, f770, f771;
mul.f32 f773, f621, f764;
fma.rn.f32 f774, f622, f762, f773;
mul.f32 f775, f772, f470;
mul.f32 f776, f774, f488;
sub.f32 f777, f775, f776;
mul.f32 f778, f772, f488;
fma.rn.f32 f779, f774, f470, f778;
mul.f32 f780, f621, f772;
mul.f32 f781, f622, f774;
sub.f32 f782, f780, f781;
mul.f32 f783, f621, f774;
fma.rn.f32 f784, f622, f772, f783;
mul.f32 f785, f782, f514;
mul.f32 f786, f784, f532;
sub.f32 f787, f785, f786;
mul.f32 f788, f782, f532;
fma.rn.f32 f789, f784, f514, f788;
mul.f32 f790, f621, f782;
mul.f32 f791, f622, f784;
sub.f32 f792, f790, f791;
mul.f32 f793, f621, f784;
fma.rn.f32 f794, f622, f782, f793;
mul.f32 f795, f792, f558;
mul.f32 f796, f794, f576;
sub.f32 f797, f795, f796;
mul.f32 f798, f792, f576;
fma.rn.f32 f799, f794, f558, f798;
mul.f32 f800, f621, f792;
mul.f32 f801, f622, f794;
sub.f32 f802, f800, f801;
mul.f32 f803, f621, f794;
fma.rn.f32 f804, f622, f792, f803;
mul.f32 f805, f802, f602;
mul.f32 f806, f804, f620;
sub.f32 f807, f805, f806;
mul.f32 f808, f802, f620;
fma.rn.f32 f809, f804, f602, f808;
mul.f32 f810, f621, f802;
mul.f32 f811, f622, f804;
sub.f32 f812, f810, f811;
mul.f32 f813, f621, f804;
fma.rn.f32 f814, f622, f802, f813;
mul.f32 f815, f812, f418;
mul.f32 f816, f814, f436;
sub.f32 f817, f815, f816;
mul.f32 f818, f812, f436;
fma.rn.f32 f819, f814, f418, f818;
mul.f32 f820, f621, f812;
mul.f32 f821, f622, f814;
sub.f32 f822, f820, f821;
mul.f32 f823, f621, f814;
fma.rn.f32 f824, f622, f812, f823;
mul.f32 f825, f822, f462;
mul.f32 f826, f824, f480;
sub.f32 f827, f825, f826;
mul.f32 f828, f822, f480;
fma.rn.f32 f829, f824, f462, f828;
mul.f32 f830, f621, f822;
mul.f32 f831, f622, f824;
sub.f32 f832, f830, f831;
mul.f32 f833, f621, f824;
fma.rn.f32 f834, f622, f822, f833;
mul.f32 f835, f832, f506;
mul.f32 f836, f834, f524;
sub.f32 f837, f835, f836;
mul.f32 f838, f832, f524;
fma.rn.f32 f839, f834, f506, f838;
mul.f32 f840, f621, f832;
mul.f32 f841, f622, f834;
sub.f32 f842, f840, f841;
mul.f32 f843, f621, f834;
fma.rn.f32 f844, f622, f832, f843;
mul.f32 f845, f842, f550;
mul.f32 f846, f844, f568;
sub.f32 f847, f845, f846;
mul.f32 f848, f842, f568;
fma.rn.f32 f849, f844, f550, f848;
mul.f32 f850, f621, f842;
mul.f32 f851, f622, f844;
sub.f32 f852, f850, f851;
mul.f32 f853, f621, f844;
fma.rn.f32 f854, f622, f842, f853;
mul.f32 f855, f852, f594;
mul.f32 f856, f854, f612;
sub.f32 f857, f855, f856;
mul.f32 f858, f852, f612;
fma.rn.f32 f859, f854, f594, f858;
mad.lo.s32 r8, r5, 500, r3;
barrier.sync 0;
mad.lo.s32 r9, r7, 100, r8;
st.shared.f32 [r9], f404;
st.shared.f32 [r9+4], f627;
st.shared.f32 [r9+8], f637;
st.shared.f32 [r9+12], f647;
st.shared.f32 [r9+16], f657;
st.shared.f32 [r9+20], f667;
st.shared.f32 [r9+24], f677;
st.shared.f32 [r9+28], f687;
st.shared.f32 [r9+32], f697;
st.shared.f32 [r9+36], f707;
st.shared.f32 [r9+40], f717;
st.shared.f32 [r9+44], f727;
st.shared.f32 [r9+48], f737;
st.shared.f32 [r9+52], f747;
st.shared.f32 [r9+56], f757;
st.shared.f32 [r9+60], f767;
st.shared.f32 [r9+64], f777;
st.shared.f32 [r9+68], f787;
st.shared.f32 [r9+72], f797;
st.shared.f32 [r9+76], f807;
st.shared.f32 [r9+80], f817;
st.shared.f32 [r9+84], f827;
st.shared.f32 [r9+88], f837;
st.shared.f32 [r9+92], f847;
st.shared.f32 [r9+96], f857;
barrier.sync 0;
mad.lo.s32 r10, r7, -96, r9;
ld.shared.f32 f860, [r10];
ld.shared.f32 f861, [r10+20];
ld.shared.f32 f862, [r10+40];
ld.shared.f32 f863, [r10+60];
ld.shared.f32 f864, [r10+80];
ld.shared.f32 f865, [r10+100];
ld.shared.f32 f866, [r10+120];
ld.shared.f32 f867, [r10+140];
ld.shared.f32 f868, [r10+160];
ld.shared.f32 f869, [r10+180];
ld.shared.f32 f870, [r10+200];
ld.shared.f32 f871, [r10+220];
ld.shared.f32 f872, [r10+240];
ld.shared.f32 f873, [r10+260];
ld.shared.f32 f874, [r10+280];
ld.shared.f32 f875, [r10+300];
ld.shared.f32 f876, [r10+320];
ld.shared.f32 f877, [r10+340];
ld.shared.f32 f878, [r10+360];
ld.shared.f32 f879, [r10+380];
ld.shared.f32 f880, [r10+400];
ld.shared.f32 f881, [r10+420];
ld.shared.f32 f882, [r10+440];
ld.shared.f32 f883, [r10+460];
ld.shared.f32 f884, [r10+480];
barrier.sync 0;
st.shared.f32 [r9], f408;
st.shared.f32 [r9+4], f629;
st.shared.f32 [r9+8], f639;
st.shared.f32 [r9+12], f649;
st.shared.f32 [r9+16], f659;
st.shared.f32 [r9+20], f669;
st.shared.f32 [r9+24], f679;
st.shared.f32 [r9+28], f689;
st.shared.f32 [r9+32], f699;
st.shared.f32 [r9+36], f709;
st.shared.f32 [r9+40], f719;
st.shared.f32 [r9+44], f729;
st.shared.f32 [r9+48], f739;
st.shared.f32 [r9+52], f749;
st.shared.f32 [r9+56], f759;
st.shared.f32 [r9+60], f769;
st.shared.f32 [r9+64], f779;
st.shared.f32 [r9+68], f789;
st.shared.f32 [r9+72], f799;
st.shared.f32 [r9+76], f809;
st.shared.f32 [r9+80], f819;
st.shared.f32 [r9+84], f829;
st.shared.f32 [r9+88], f839;
st.shared.f32 [r9+92], f849;
st.shared.f32 [r9+96], f859;
barrier.sync 0;
ld.shared.f32 f885, [r10];
ld.shared.f32 f886, [r10+20];
ld.shared.f32 f887, [r10+40];
ld.shared.f32 f888, [r10+60];
ld.shared.f32 f889, [r10+80];
ld.shared.f32 f890, [r10+100];
ld.shared.f32 f891, [r10+120];
ld.shared.f32 f892, [r10+140];
ld.shared.f32 f893, [r10+160];
ld.shared.f32 f894, [r10+180];
ld.shared.f32 f895, [r10+200];
ld.shared.f32 f896, [r10+220];
ld.shared.f32 f897, [r10+240];
ld.shared.f32 f898, [r10+260];
ld.shared.f32 f899, [r10+280];
ld.shared.f32 f900, [r10+300];
ld.shared.f32 f901, [r10+320];
ld.shared.f32 f902, [r10+340];
ld.shared.f32 f903, [r10+360];
ld.shared.f32 f904, [r10+380];
ld.shared.f32 f905, [r10+400];
ld.shared.f32 f906, [r10+420];
ld.shared.f32 f907, [r10+440];
ld.shared.f32 f908, [r10+460];
ld.shared.f32 f909, [r10+480];
add.f32 f910, f865, f880;
add.f32 f911, f860, f910;
add.f32 f912, f870, f875;
add.f32 f913, f890, f905;
add.f32 f914, f885, f913;
add.f32 f915, f895, f900;
fma.rn.f32 f916, f910, 0f3E9E377A, f860;
mul.f32 f917, f912, 0f3F4F1BBD;
sub.f32 f918, f916, f917;
sub.f32 f919, f890, f905;
mul.f32 f920, f919, 0f3F737871;
sub.f32 f921, f895, f900;
mul.f32 f922, f921, 0fBF167918;
sub.f32 f923, f922, f920;
mul.f32 f924, f910, 0f3F4F1BBD;
sub.f32 f925, f860, f924;
fma.rn.f32 f926, f912, 0f3E9E377A, f925;
mul.f32 f927, f919, 0f3F167918;
mul.f32 f928, f921, 0f3F737871;
sub.f32 f929, f928, f927;
fma.rn.f32 f930, f913, 0f3E9E377A, f885;
mul.f32 f931, f915, 0f3F4F1BBD;
sub.f32 f932, f930, f931;
sub.f32 f933, f865, f880;
mul.f32 f934, f933, 0f3F737871;
sub.f32 f935, f870, f875;
mul.f32 f936, f935, 0fBF167918;
sub.f32 f937, f936, f934;
mul.f32 f938, f913, 0f3F4F1BBD;
sub.f32 f939, f885, f938;
fma.rn.f32 f940, f915, 0f3E9E377A, f939;
mul.f32 f941, f933, 0f3F167918;
mul.f32 f942, f935, 0f3F737871;
sub.f32 f943, f942, f941;
add.f32 f944, f866, f881;
add.f32 f945, f861, f944;
add.f32 f946, f871, f876;
add.f32 f947, f891, f906;
add.f32 f948, f886, f947;
add.f32 f949, f896, f901;
fma.rn.f32 f950, f944, 0f3E9E377A, f861;
mul.f32 f951, f946, 0f3F4F1BBD;
sub.f32 f952, f950, f951;
sub.f32 f953, f891, f906;
mul.f32 f954, f953, 0f3F737871;
sub.f32 f955, f896, f901;
mul.f32 f956, f955, 0fBF167918;
sub.f32 f957, f956, f954;
mul.f32 f958, f944, 0f3F4F1BBD;
sub.f32 f959, f861, f958;
fma.rn.f32 f960, f946, 0f3E9E377A, f959;
mul.f32 f961, f953, 0f3F167918;
mul.f32 f962, f955, 0f3F737871;
sub.f32 f963, f962, f961;
fma.rn.f32 f964, f947, 0f3E9E377A, f886;
mul.f32 f965, f949, 0f3F4F1BBD;
sub.f32 f966, f964, f965;
sub.f32 f967, f866, f881;
mul.f32 f968, f967, 0f3F737871;
sub.f32 f969, f871, f876;
mul.f32 f970, f969, 0fBF167918;
sub.f32 f971, f970, f968;
mul.f32 f972, f947, 0f3F4F1BBD;
sub.f32 f973, f886, f972;
fma.rn.f32 f974, f949, 0f3E9E377A, f973;
mul.f32 f975, f967, 0f3F167918;
mul.f32 f976, f969, 0f3F737871;
sub.f32 f977, f976, f975;
add.f32 f978, f867, f882;
add.f32 f979, f862, f978;
add.f32 f980, f872, f877;
add.f32 f981, f892, f907;
add.f32 f982, f887, f981;
add.f32 f983, f897, f902;
fma.rn.f32 f984, f978, 0f3E9E377A, f862;
mul.f32 f985, f980, 0f3F4F1BBD;
sub.f32 f986, f984, f985;
sub.f32 f987, f892, f907;
mul.f32 f988, f987, 0f3F737871;
sub.f32 f989, f897, f902;
mul.f32 f990, f989, 0fBF167918;
sub.f32 f991, f990, f988;
mul.f32 f992, f978, 0f3F4F1BBD;
sub.f32 f993, f862, f992;
fma.rn.f32 f994, f980, 0f3E9E377A, f993;
mul.f32 f995, f987, 0f3F167918;
mul.f32 f996, f989, 0f3F737871;
sub.f32 f997, f996, f995;
fma.rn.f32 f998, f981, 0f3E9E377A, f887;
mul.f32 f999, f983, 0f3F4F1BBD;
sub.f32 f1000, f998, f999;
sub.f32 f1001, f867, f882;
mul.f32 f1002, f1001, 0f3F737871;
sub.f32 f1003, f872, f877;
mul.f32 f1004, f1003, 0fBF167918;
sub.f32 f1005, f1004, f1002;
mul.f32 f1006, f981, 0f3F4F1BBD;
sub.f32 f1007, f887, f1006;
fma.rn.f32 f1008, f983, 0f3E9E377A, f1007;
mul.f32 f1009, f1001, 0f3F167918;
mul.f32 f1010, f1003, 0f3F737871;
sub.f32 f1011, f1010, f1009;
add.f32 f1012, f868, f883;
add.f32 f1013, f863, f1012;
add.f32 f1014, f873, f878;
add.f32 f1015, f893, f908;
add.f32 f1016, f888, f1015;
add.f32 f1017, f898, f903;
fma.rn.f32 f1018, f1012, 0f3E9E377A, f863;
mul.f32 f1019, f1014, 0f3F4F1BBD;
sub.f32 f1020, f1018, f1019;
sub.f32 f1021, f893, f908;
mul.f32 f1022, f1021, 0f3F737871;
sub.f32 f1023, f898, f903;
mul.f32 f1024, f1023, 0fBF167918;
sub.f32 f1025, f1024, f1022;
mul.f32 f1026, f1012, 0f3F4F1BBD;
sub.f32 f1027, f863, f1026;
fma.rn.f32 f1028, f1014, 0f3E9E377A, f1027;
mul.f32 f1029, f1021, 0f3F167918;
mul.f32 f1030, f1023, 0f3F737871;
sub.f32 f1031, f1030, f1029;
fma.rn.f32 f1032, f1015, 0f3E9E377A, f888;
mul.f32 f1033, f1017, 0f3F4F1BBD;
sub.f32 f1034, f1032, f1033;
sub.f32 f1035, f868, f883;
mul.f32 f1036, f1035, 0f3F737871;
sub.f32 f1037, f873, f878;
mul.f32 f1038, f1037, 0fBF167918;
sub.f32 f1039, f1038, f1036;
mul.f32 f1040, f1015, 0f3F4F1BBD;
sub.f32 f1041, f888, f1040;
fma.rn.f32 f1042, f1017, 0f3E9E377A, f1041;
mul.f32 f1043, f1035, 0f3F167918;
mul.f32 f1044, f1037, 0f3F737871;
sub.f32 f1045, f1044, f1043;
add.f32 f1046, f869, f884;
add.f32 f1047, f864, f1046;
add.f32 f1048, f874, f879;
add.f32 f1049, f894, f909;
add.f32 f1050, f889, f1049;
add.f32 f1051, f899, f904;
fma.rn.f32 f1052, f1046, 0f3E9E377A, f864;
mul.f32 f1053, f1048, 0f3F4F1BBD;
sub.f32 f1054, f1052, f1053;
sub.f32 f1055, f894, f909;
mul.f32 f1056, f1055, 0f3F737871;
sub.f32 f1057, f899, f904;
mul.f32 f1058, f1057, 0fBF167918;
sub.f32 f1059, f1058, f1056;
mul.f32 f1060, f1046, 0f3F4F1BBD;
sub.f32 f1061, f864, f1060;
fma.rn.f32 f1062, f1048, 0f3E9E377A, f1061;
mul.f32 f1063, f1055, 0f3F167918;
mul.f32 f1064, f1057, 0f3F737871;
sub.f32 f1065, f1064, f1063;
fma.rn.f32 f1066, f1049, 0f3E9E377A, f889;
mul.f32 f1067, f1051, 0f3F4F1BBD;
sub.f32 f1068, f1066, f1067;
sub.f32 f1069, f869, f884;
mul.f32 f1070, f1069, 0f3F737871;
sub.f32 f1071, f874, f879;
mul.f32 f1072, f1071, 0fBF167918;
sub.f32 f1073, f1072, f1070;
mul.f32 f1074, f1049, 0f3F4F1BBD;
sub.f32 f1075, f889, f1074;
fma.rn.f32 f1076, f1051, 0f3E9E377A, f1075;
mul.f32 f1077, f1069, 0f3F167918;
mul.f32 f1078, f1071, 0f3F737871;
sub.f32 f1079, f1078, f1077;
add.f32 %0, f912, f911;
add.f32 %1, f915, f914;
add.f32 %2, f946, f945;
add.f32 %3, f949, f948;
add.f32 %4, f980, f979;
add.f32 %5, f983, f982;
add.f32 %6, f1014, f1013;
add.f32 %7, f1017, f1016;
add.f32 %8, f1048, f1047;
add.f32 %9, f1051, f1050;
add.f32 %11, f937, f932;
sub.f32 %10, f918, f923;
add.f32 %13, f971, f966;
sub.f32 %12, f952, f957;
add.f32 %15, f1005, f1000;
sub.f32 %14, f986, f991;
add.f32 %17, f1039, f1034;
sub.f32 %16, f1020, f1025;
add.f32 %19, f1073, f1068;
sub.f32 %18, f1054, f1059;
sub.f32 %20, f926, f929;
add.f32 %21, f943, f940;
sub.f32 %22, f960, f963;
add.f32 %23, f977, f974;
sub.f32 %24, f994, f997;
add.f32 %25, f1011, f1008;
sub.f32 %26, f1028, f1031;
add.f32 %27, f1045, f1042;
sub.f32 %28, f1062, f1065;
add.f32 %29, f1079, f1076;
add.f32 %30, f929, f926;
sub.f32 %31, f940, f943;
add.f32 %32, f963, f960;
sub.f32 %33, f974, f977;
add.f32 %34, f997, f994;
sub.f32 %35, f1008, f1011;
add.f32 %36, f1031, f1028;
sub.f32 %37, f1042, f1045;
add.f32 %38, f1065, f1062;
sub.f32 %39, f1076, f1079;
sub.f32 %41, f932, f937;
add.f32 %40, f923, f918;
sub.f32 %43, f966, f971;
add.f32 %42, f957, f952;
sub.f32 %45, f1000, f1005;
add.f32 %44, f991, f986;
sub.f32 %47, f1034, f1039;
add.f32 %46, f1025, f1020;
sub.f32 %49, f1068, f1073;
add.f32 %48, f1059, f1054;
})"
     // Operand numbering used by the PTX text above:
     //   %0..%49  outputs: rmem[i].x is %(2*i), rmem[i].y is %(2*i + 1).
     //   %50      smem (32-bit register operand).
     //   %51      lut_sp_25_125 (64-bit address operand).
     //   %52..    inputs, in the order listed below. Several rmem[i].y values
     //            are listed twice, so input numbers are NOT 52 + 2*i; for
     //            example rmem[1].y is both %55 and %56 and the body reads %56.
     //            Keep this list and the %N references in the PTX in lockstep.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), 
"f"(rmem[24].y));
};




// cufftdx_private_function<160, float, 1>
//
// Per-thread body of a three-stage, 5-point-butterfly single-precision FFT
// kernel written as one inline-PTX statement. The meaning of the integer
// template arguments is defined outside this chunk; the include guard of this
// header reads FFT_125_FP32_FWD, which matches what the code does
// (25 lanes x 5 complex values per lane, three butterfly stages).
//
// Parameters
//   rmem  In/out. rmem[0..4] (.x and .y) are read as the five complex inputs
//         of this thread and overwritten with its five complex results.
//   smem  32-bit shared-memory address used as the base of the exchange
//         buffer (it is fed straight into st.shared / ld.shared addresses).
//
// Other names read: lut_sp_5_125 and lut_sp_5_25, declared elsewhere. They are
// loaded with ld.global.v2.f32 using an 8-byte element stride -- presumably
// complex twiddle-factor tables; verify against their definition.
//
// Thread / memory layout, as computed by the PTX:
//   q    = tid.x / 25   (mul.wide by 1374389535, shift right 35)
//   lane = tid.x % 25
//   base = smem + tid.y * 1000 + q * 1000      (byte address)
//   Each (tid.y, q) group therefore uses 1000 bytes = 125 float2 values.
//   NOTE(review): tid.y and q are scaled by the same 1000-byte pitch, so
//   (tid.y=1, q=0) and (tid.y=0, q=1) map to the same region -- presumably the
//   caller never has both varying at once; TODO confirm.
//
// Stages:
//   1. 5-point butterfly on the register inputs. Outputs 1..4 are multiplied
//      by w, w^2, w^3, w^4 with w = lut_sp_5_125[lane] (powers are built by
//      repeated complex multiplication). The five results are stored as
//      consecutive float2 at base + lane*40, then re-read with a 200-byte
//      stride starting at base + lane*8.
//   2. 5-point butterfly on the re-read values. Twiddle w = lut_sp_5_25[lane/5]
//      (lane/5 via mul.wide by 0xCCCCCCCD, shift right 34). Results are stored
//      with a 40-byte stride at base + (lane/5)*200 + (lane%5)*8 and re-read
//      with the same 200-byte-stride pattern as after stage 1.
//   3. Final 5-point butterfly; results are written to the output operands.
//
// Butterfly constants (IEEE-754 hex immediates in the PTX):
//   0f3E9E377A ~ 0.309017 = cos(2*pi/5)     0f3F4F1BBD ~ 0.809017 = -cos(4*pi/5)
//   0f3F737871 ~ 0.951057 = sin(2*pi/5)     0f3F167918 ~ 0.587785 = sin(4*pi/5)
//   (0fBF167918 is the negated sin(4*pi/5).)
//
// Synchronization: the PTX executes `barrier.sync 0` four times, with no thread
// count. Per the PTX ISA that form involves every thread of the CTA, so this
// function must be reached uniformly by all threads of the block.
//
// NOTE(review): the asm statement is `volatile` but declares no "memory"
// clobber even though it reads and writes shared memory -- confirm that no
// surrounding C++ code depends on ordering against those accesses.
template<> __forceinline__ __device__ void cufftdx_private_function<160, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand map for the asm statement below:
//   %0..%9   outputs  rmem[0].x, rmem[0].y, ..., rmem[4].x, rmem[4].y
//   %10      smem
//   %11      lut_sp_5_125
//   %12      lut_sp_5_25
//   %13,%14  rmem[0].x, rmem[0].y
//   %15      rmem[1].x      %16,%17  rmem[1].y (listed twice; only %17 is used)
//   %18      rmem[2].x      %19,%20  rmem[2].y (listed twice; only %20 is used)
//   %21,%22  rmem[3].x, rmem[3].y
//   %23,%24  rmem[4].x, rmem[4].y
asm volatile (R"({
.reg .f32 f<271>;
.reg .b32 r<18>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %10;
mad.lo.s32 r3, r1, 1000, r2;
mov.u32 r4, %tid.x;
add.f32 f21, %15, %23;
add.f32 f22, %13, f21;
add.f32 f23, %18, %21;
add.f32 f24, %17, %24;
add.f32 f25, %14, f24;
add.f32 f26, %20, %22;
fma.rn.f32 f27, f21, 0f3E9E377A, %13;
mul.f32 f28, f23, 0f3F4F1BBD;
sub.f32 f29, f27, f28;
sub.f32 f30, %17, %24;
mul.f32 f31, f30, 0f3F737871;
sub.f32 f32, %20, %22;
mul.f32 f33, f32, 0fBF167918;
sub.f32 f34, f33, f31;
sub.f32 f35, f29, f34;
add.f32 f36, f34, f29;
mul.f32 f37, f21, 0f3F4F1BBD;
sub.f32 f38, %13, f37;
fma.rn.f32 f39, f23, 0f3E9E377A, f38;
mul.f32 f40, f30, 0f3F167918;
mul.f32 f41, f32, 0f3F737871;
sub.f32 f42, f41, f40;
sub.f32 f43, f39, f42;
add.f32 f44, f42, f39;
fma.rn.f32 f45, f24, 0f3E9E377A, %14;
mul.f32 f46, f26, 0f3F4F1BBD;
sub.f32 f47, f45, f46;
sub.f32 f48, %15, %23;
mul.f32 f49, f48, 0f3F737871;
sub.f32 f50, %18, %21;
mul.f32 f51, f50, 0fBF167918;
sub.f32 f52, f51, f49;
add.f32 f53, f52, f47;
sub.f32 f54, f47, f52;
mul.f32 f55, f24, 0f3F4F1BBD;
sub.f32 f56, %14, f55;
fma.rn.f32 f57, f26, 0f3E9E377A, f56;
mul.f32 f58, f48, 0f3F167918;
mul.f32 f59, f50, 0f3F737871;
sub.f32 f60, f59, f58;
add.f32 f61, f60, f57;
sub.f32 f62, f57, f60;
mul.wide.u32 rd2, r4, 1374389535;
shr.u64 rd3, rd2, 35;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 25;
sub.s32 r7, r4, r6;
mad.lo.s32 r8, r5, 1000, r3;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %11;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f63, f64}, [rd6];
mul.f32 f67, f63, f35;
mul.f32 f68, f64, f53;
mul.f32 f69, f63, f53;
mul.f32 f70, f63, f63;
mul.f32 f71, f64, f64;
sub.f32 f72, f70, f71;
mul.f32 f73, f64, f63;
fma.rn.f32 f74, f64, f63, f73;
mul.f32 f75, f72, f43;
mul.f32 f76, f74, f61;
mul.f32 f77, f72, f61;
mul.f32 f78, f63, f72;
mul.f32 f79, f64, f74;
sub.f32 f80, f78, f79;
mul.f32 f81, f63, f74;
fma.rn.f32 f82, f64, f72, f81;
mul.f32 f83, f80, f44;
mul.f32 f84, f82, f62;
mul.f32 f85, f80, f62;
mul.f32 f86, f63, f80;
mul.f32 f87, f64, f82;
sub.f32 f88, f86, f87;
mul.f32 f89, f63, f82;
fma.rn.f32 f90, f64, f80, f89;
mul.f32 f91, f88, f36;
mul.f32 f92, f90, f54;
mul.f32 f93, f88, f54;
barrier.sync 0;
mad.lo.s32 r9, r7, 40, r8;
add.f32 f94, f26, f25;
add.f32 f95, f23, f22;
st.shared.v2.f32 [r9], {f95, f94};
fma.rn.f32 f96, f64, f35, f69;
sub.f32 f97, f67, f68;
st.shared.v2.f32 [r9+8], {f97, f96};
fma.rn.f32 f98, f74, f43, f77;
sub.f32 f99, f75, f76;
st.shared.v2.f32 [r9+16], {f99, f98};
sub.f32 f100, f83, f84;
fma.rn.f32 f101, f82, f44, f85;
st.shared.v2.f32 [r9+24], {f100, f101};
fma.rn.f32 f102, f90, f36, f93;
sub.f32 f103, f91, f92;
st.shared.v2.f32 [r9+32], {f103, f102};
barrier.sync 0;
shl.b32 r10, r7, 5;
sub.s32 r11, r9, r10;
ld.shared.v2.f32 {f104, f105}, [r11];
ld.shared.v2.f32 {f108, f109}, [r11+200];
ld.shared.v2.f32 {f112, f113}, [r11+400];
ld.shared.v2.f32 {f116, f117}, [r11+600];
ld.shared.v2.f32 {f120, f121}, [r11+800];
add.f32 f124, f108, f120;
add.f32 f125, f104, f124;
add.f32 f126, f112, f116;
add.f32 f127, f109, f121;
add.f32 f128, f105, f127;
add.f32 f129, f113, f117;
fma.rn.f32 f130, f124, 0f3E9E377A, f104;
mul.f32 f131, f126, 0f3F4F1BBD;
sub.f32 f132, f130, f131;
sub.f32 f133, f109, f121;
mul.f32 f134, f133, 0f3F737871;
sub.f32 f135, f113, f117;
mul.f32 f136, f135, 0fBF167918;
sub.f32 f137, f136, f134;
sub.f32 f138, f132, f137;
add.f32 f139, f137, f132;
mul.f32 f140, f124, 0f3F4F1BBD;
sub.f32 f141, f104, f140;
fma.rn.f32 f142, f126, 0f3E9E377A, f141;
mul.f32 f143, f133, 0f3F167918;
mul.f32 f144, f135, 0f3F737871;
sub.f32 f145, f144, f143;
sub.f32 f146, f142, f145;
add.f32 f147, f145, f142;
fma.rn.f32 f148, f127, 0f3E9E377A, f105;
mul.f32 f149, f129, 0f3F4F1BBD;
sub.f32 f150, f148, f149;
sub.f32 f151, f108, f120;
mul.f32 f152, f151, 0f3F737871;
sub.f32 f153, f112, f116;
mul.f32 f154, f153, 0fBF167918;
sub.f32 f155, f154, f152;
add.f32 f156, f155, f150;
sub.f32 f157, f150, f155;
mul.f32 f158, f127, 0f3F4F1BBD;
sub.f32 f159, f105, f158;
fma.rn.f32 f160, f129, 0f3E9E377A, f159;
mul.f32 f161, f151, 0f3F167918;
mul.f32 f162, f153, 0f3F737871;
sub.f32 f163, f162, f161;
add.f32 f164, f163, f160;
sub.f32 f165, f160, f163;
mul.wide.u32 rd7, r7, -858993459;
shr.u64 rd8, rd7, 34;
cvt.u32.u64 r12, rd8;
mul.lo.s32 r13, r12, 5;
sub.s32 r14, r7, r13;
mul.wide.u32 rd9, r12, 8;
mov.u64 rd10, %12;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f166, f167}, [rd11];
mul.f32 f170, f166, f138;
mul.f32 f171, f167, f156;
mul.f32 f172, f166, f156;
mul.f32 f173, f166, f166;
mul.f32 f174, f167, f167;
sub.f32 f175, f173, f174;
mul.f32 f176, f167, f166;
fma.rn.f32 f177, f167, f166, f176;
mul.f32 f178, f175, f146;
mul.f32 f179, f177, f164;
mul.f32 f180, f175, f164;
mul.f32 f181, f166, f175;
mul.f32 f182, f167, f177;
sub.f32 f183, f181, f182;
mul.f32 f184, f166, f177;
fma.rn.f32 f185, f167, f175, f184;
mul.f32 f186, f183, f147;
mul.f32 f187, f185, f165;
mul.f32 f188, f183, f165;
mul.f32 f189, f166, f183;
mul.f32 f190, f167, f185;
sub.f32 f191, f189, f190;
mul.f32 f192, f166, f185;
fma.rn.f32 f193, f167, f183, f192;
mul.f32 f194, f191, f139;
mul.f32 f195, f193, f157;
mul.f32 f196, f191, f157;
shl.b32 r15, r14, 3;
add.s32 r16, r8, r15;
barrier.sync 0;
mad.lo.s32 r17, r12, 200, r16;
add.f32 f197, f129, f128;
add.f32 f198, f126, f125;
st.shared.v2.f32 [r17], {f198, f197};
fma.rn.f32 f199, f167, f138, f172;
sub.f32 f200, f170, f171;
st.shared.v2.f32 [r17+40], {f200, f199};
fma.rn.f32 f201, f177, f146, f180;
sub.f32 f202, f178, f179;
st.shared.v2.f32 [r17+80], {f202, f201};
fma.rn.f32 f203, f185, f147, f188;
sub.f32 f204, f186, f187;
st.shared.v2.f32 [r17+120], {f204, f203};
fma.rn.f32 f205, f193, f139, f196;
sub.f32 f206, f194, f195;
st.shared.v2.f32 [r17+160], {f206, f205};
barrier.sync 0;
ld.shared.v2.f32 {f207, f208}, [r11];
ld.shared.v2.f32 {f211, f212}, [r11+200];
ld.shared.v2.f32 {f215, f216}, [r11+400];
ld.shared.v2.f32 {f219, f220}, [r11+600];
ld.shared.v2.f32 {f223, f224}, [r11+800];
add.f32 f227, f211, f223;
add.f32 f228, f207, f227;
add.f32 f229, f215, f219;
add.f32 f230, f212, f224;
add.f32 f231, f208, f230;
add.f32 f232, f216, f220;
fma.rn.f32 f233, f227, 0f3E9E377A, f207;
mul.f32 f234, f229, 0f3F4F1BBD;
sub.f32 f235, f233, f234;
sub.f32 f236, f212, f224;
mul.f32 f237, f236, 0f3F737871;
sub.f32 f238, f216, f220;
mul.f32 f239, f238, 0fBF167918;
sub.f32 f240, f239, f237;
mul.f32 f241, f227, 0f3F4F1BBD;
sub.f32 f242, f207, f241;
fma.rn.f32 f243, f229, 0f3E9E377A, f242;
mul.f32 f244, f236, 0f3F167918;
mul.f32 f245, f238, 0f3F737871;
sub.f32 f246, f245, f244;
fma.rn.f32 f247, f230, 0f3E9E377A, f208;
mul.f32 f248, f232, 0f3F4F1BBD;
sub.f32 f249, f247, f248;
sub.f32 f250, f211, f223;
mul.f32 f251, f250, 0f3F737871;
sub.f32 f252, f215, f219;
mul.f32 f253, f252, 0fBF167918;
sub.f32 f254, f253, f251;
mul.f32 f255, f230, 0f3F4F1BBD;
sub.f32 f256, f208, f255;
fma.rn.f32 f257, f232, 0f3E9E377A, f256;
mul.f32 f258, f250, 0f3F167918;
mul.f32 f259, f252, 0f3F737871;
sub.f32 f260, f259, f258;
add.f32 %1, f232, f231;
add.f32 %0, f229, f228;
add.f32 %3, f254, f249;
sub.f32 %2, f235, f240;
add.f32 %5, f260, f257;
sub.f32 %4, f243, f246;
sub.f32 %7, f257, f260;
add.f32 %6, f246, f243;
sub.f32 %9, f249, f254;
add.f32 %8, f240, f235;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y));
};




// cufftdx_private_function<161, float, 1>
//
// Per-thread body of a three-stage, 5-point-butterfly single-precision FFT
// kernel written as one inline-PTX statement. The arithmetic follows the same
// pattern as the <160, float, 1> specialization in this file, but the
// shared-memory exchange moves scalar floats instead of float2: real parts are
// exchanged first, then imaginary parts reuse the same addresses in a second
// pass. The meaning of the integer template arguments is defined outside this
// chunk; the include guard of this header reads FFT_125_FP32_FWD.
//
// Parameters
//   rmem  In/out. rmem[0..4] (.x and .y) are read as the five complex inputs
//         of this thread and overwritten with its five complex results.
//   smem  32-bit shared-memory address used as the base of the exchange
//         buffer (it is fed straight into st.shared / ld.shared addresses).
//
// Other names read: lut_sp_5_125 and lut_sp_5_25, declared elsewhere. They are
// loaded with ld.global.v2.f32 using an 8-byte element stride -- presumably
// complex twiddle-factor tables; verify against their definition.
//
// Thread / memory layout, as computed by the PTX:
//   q    = tid.x / 25   (mul.wide by 1374389535, shift right 35)
//   lane = tid.x % 25
//   base = smem + tid.y * 500 + q * 500        (byte address)
//   Each (tid.y, q) group therefore uses 500 bytes = 125 floats.
//   NOTE(review): tid.y and q are scaled by the same 500-byte pitch, so
//   (tid.y=1, q=0) and (tid.y=0, q=1) map to the same region -- presumably the
//   caller never has both varying at once; TODO confirm.
//
// Stages:
//   1. 5-point butterfly on the register inputs. Outputs 1..4 are multiplied
//      by w, w^2, w^3, w^4 with w = lut_sp_5_125[lane]. Real parts are stored
//      as five consecutive floats at base + lane*20 and re-read with a
//      100-byte stride from base + lane*4; after a barrier the imaginary
//      parts go through the same store/load pattern.
//   2. 5-point butterfly on the re-read values. Twiddle w = lut_sp_5_25[lane/5]
//      (lane/5 via mul.wide by 0xCCCCCCCD, shift right 34). Real parts are
//      stored with a 20-byte stride at base + (lane/5)*100 + (lane%5)*4 and
//      re-read with the 100-byte-stride pattern; then the imaginary parts.
//   3. Final 5-point butterfly; results are written to the output operands.
//
// Butterfly constants (IEEE-754 hex immediates in the PTX):
//   0f3E9E377A ~ 0.309017 = cos(2*pi/5)     0f3F4F1BBD ~ 0.809017 = -cos(4*pi/5)
//   0f3F737871 ~ 0.951057 = sin(2*pi/5)     0f3F167918 ~ 0.587785 = sin(4*pi/5)
//   (0fBF167918 is the negated sin(4*pi/5).)
//
// Synchronization: the PTX executes `barrier.sync 0` eight times, with no
// thread count. Per the PTX ISA that form involves every thread of the CTA, so
// this function must be reached uniformly by all threads of the block.
//
// NOTE(review): the asm statement is `volatile` but declares no "memory"
// clobber even though it reads and writes shared memory -- confirm that no
// surrounding C++ code depends on ordering against those accesses.
template<> __forceinline__ __device__ void cufftdx_private_function<161, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand map for the asm statement below:
//   %0..%9   outputs  rmem[0].x, rmem[0].y, ..., rmem[4].x, rmem[4].y
//   %10      smem
//   %11      lut_sp_5_125
//   %12      lut_sp_5_25
//   %13,%14  rmem[0].x, rmem[0].y
//   %15      rmem[1].x      %16,%17  rmem[1].y (listed twice; only %17 is used)
//   %18      rmem[2].x      %19,%20  rmem[2].y (listed twice; only %20 is used)
//   %21,%22  rmem[3].x, rmem[3].y
//   %23,%24  rmem[4].x, rmem[4].y
asm volatile (R"({
.reg .f32 f<251>;
.reg .b32 r<18>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %10;
mad.lo.s32 r3, r1, 500, r2;
mov.u32 r4, %tid.x;
add.f32 f21, %15, %23;
add.f32 f22, %13, f21;
add.f32 f23, %18, %21;
add.f32 f24, f23, f22;
add.f32 f25, %17, %24;
add.f32 f26, %14, f25;
add.f32 f27, %20, %22;
add.f32 f28, f27, f26;
fma.rn.f32 f29, f21, 0f3E9E377A, %13;
mul.f32 f30, f23, 0f3F4F1BBD;
sub.f32 f31, f29, f30;
sub.f32 f32, %17, %24;
mul.f32 f33, f32, 0f3F737871;
sub.f32 f34, %20, %22;
mul.f32 f35, f34, 0fBF167918;
sub.f32 f36, f35, f33;
sub.f32 f37, f31, f36;
add.f32 f38, f36, f31;
mul.f32 f39, f21, 0f3F4F1BBD;
sub.f32 f40, %13, f39;
fma.rn.f32 f41, f23, 0f3E9E377A, f40;
mul.f32 f42, f32, 0f3F167918;
mul.f32 f43, f34, 0f3F737871;
sub.f32 f44, f43, f42;
sub.f32 f45, f41, f44;
add.f32 f46, f44, f41;
fma.rn.f32 f47, f25, 0f3E9E377A, %14;
mul.f32 f48, f27, 0f3F4F1BBD;
sub.f32 f49, f47, f48;
sub.f32 f50, %15, %23;
mul.f32 f51, f50, 0f3F737871;
sub.f32 f52, %18, %21;
mul.f32 f53, f52, 0fBF167918;
sub.f32 f54, f53, f51;
add.f32 f55, f54, f49;
sub.f32 f56, f49, f54;
mul.f32 f57, f25, 0f3F4F1BBD;
sub.f32 f58, %14, f57;
fma.rn.f32 f59, f27, 0f3E9E377A, f58;
mul.f32 f60, f50, 0f3F167918;
mul.f32 f61, f52, 0f3F737871;
sub.f32 f62, f61, f60;
add.f32 f63, f62, f59;
sub.f32 f64, f59, f62;
mul.wide.u32 rd2, r4, 1374389535;
shr.u64 rd3, rd2, 35;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 25;
sub.s32 r7, r4, r6;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %11;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f65, f66}, [rd6];
mul.f32 f69, f65, f37;
mul.f32 f70, f66, f55;
sub.f32 f71, f69, f70;
mul.f32 f72, f65, f55;
fma.rn.f32 f73, f66, f37, f72;
mul.f32 f74, f65, f65;
mul.f32 f75, f66, f66;
sub.f32 f76, f74, f75;
mul.f32 f77, f66, f65;
fma.rn.f32 f78, f66, f65, f77;
mul.f32 f79, f76, f45;
mul.f32 f80, f78, f63;
sub.f32 f81, f79, f80;
mul.f32 f82, f76, f63;
fma.rn.f32 f83, f78, f45, f82;
mul.f32 f84, f65, f76;
mul.f32 f85, f66, f78;
sub.f32 f86, f84, f85;
mul.f32 f87, f65, f78;
fma.rn.f32 f88, f66, f76, f87;
mul.f32 f89, f86, f46;
mul.f32 f90, f88, f64;
sub.f32 f91, f89, f90;
mul.f32 f92, f86, f64;
fma.rn.f32 f93, f88, f46, f92;
mul.f32 f94, f65, f86;
mul.f32 f95, f66, f88;
sub.f32 f96, f94, f95;
mul.f32 f97, f65, f88;
fma.rn.f32 f98, f66, f86, f97;
mul.f32 f99, f96, f38;
mul.f32 f100, f98, f56;
sub.f32 f101, f99, f100;
mul.f32 f102, f96, f56;
fma.rn.f32 f103, f98, f38, f102;
mad.lo.s32 r8, r5, 500, r3;
barrier.sync 0;
mad.lo.s32 r9, r7, 20, r8;
st.shared.f32 [r9], f24;
st.shared.f32 [r9+4], f71;
st.shared.f32 [r9+8], f81;
st.shared.f32 [r9+12], f91;
st.shared.f32 [r9+16], f101;
barrier.sync 0;
shl.b32 r10, r7, 4;
sub.s32 r11, r9, r10;
ld.shared.f32 f104, [r11];
ld.shared.f32 f105, [r11+100];
ld.shared.f32 f106, [r11+200];
ld.shared.f32 f107, [r11+300];
ld.shared.f32 f108, [r11+400];
barrier.sync 0;
st.shared.f32 [r9], f28;
st.shared.f32 [r9+4], f73;
st.shared.f32 [r9+8], f83;
st.shared.f32 [r9+12], f93;
st.shared.f32 [r9+16], f103;
barrier.sync 0;
ld.shared.f32 f109, [r11];
ld.shared.f32 f110, [r11+100];
ld.shared.f32 f111, [r11+200];
ld.shared.f32 f112, [r11+300];
ld.shared.f32 f113, [r11+400];
add.f32 f114, f105, f108;
add.f32 f115, f104, f114;
add.f32 f116, f106, f107;
add.f32 f117, f116, f115;
add.f32 f118, f110, f113;
add.f32 f119, f109, f118;
add.f32 f120, f111, f112;
add.f32 f121, f120, f119;
fma.rn.f32 f122, f114, 0f3E9E377A, f104;
mul.f32 f123, f116, 0f3F4F1BBD;
sub.f32 f124, f122, f123;
sub.f32 f125, f110, f113;
mul.f32 f126, f125, 0f3F737871;
sub.f32 f127, f111, f112;
mul.f32 f128, f127, 0fBF167918;
sub.f32 f129, f128, f126;
sub.f32 f130, f124, f129;
add.f32 f131, f129, f124;
mul.f32 f132, f114, 0f3F4F1BBD;
sub.f32 f133, f104, f132;
fma.rn.f32 f134, f116, 0f3E9E377A, f133;
mul.f32 f135, f125, 0f3F167918;
mul.f32 f136, f127, 0f3F737871;
sub.f32 f137, f136, f135;
sub.f32 f138, f134, f137;
add.f32 f139, f137, f134;
fma.rn.f32 f140, f118, 0f3E9E377A, f109;
mul.f32 f141, f120, 0f3F4F1BBD;
sub.f32 f142, f140, f141;
sub.f32 f143, f105, f108;
mul.f32 f144, f143, 0f3F737871;
sub.f32 f145, f106, f107;
mul.f32 f146, f145, 0fBF167918;
sub.f32 f147, f146, f144;
add.f32 f148, f147, f142;
sub.f32 f149, f142, f147;
mul.f32 f150, f118, 0f3F4F1BBD;
sub.f32 f151, f109, f150;
fma.rn.f32 f152, f120, 0f3E9E377A, f151;
mul.f32 f153, f143, 0f3F167918;
mul.f32 f154, f145, 0f3F737871;
sub.f32 f155, f154, f153;
add.f32 f156, f155, f152;
sub.f32 f157, f152, f155;
mul.wide.u32 rd7, r7, -858993459;
shr.u64 rd8, rd7, 34;
cvt.u32.u64 r12, rd8;
mul.lo.s32 r13, r12, 5;
sub.s32 r14, r7, r13;
mul.wide.u32 rd9, r12, 8;
mov.u64 rd10, %12;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f158, f159}, [rd11];
mul.f32 f162, f158, f130;
mul.f32 f163, f159, f148;
sub.f32 f164, f162, f163;
mul.f32 f165, f158, f148;
fma.rn.f32 f166, f159, f130, f165;
mul.f32 f167, f158, f158;
mul.f32 f168, f159, f159;
sub.f32 f169, f167, f168;
mul.f32 f170, f159, f158;
fma.rn.f32 f171, f159, f158, f170;
mul.f32 f172, f169, f138;
mul.f32 f173, f171, f156;
sub.f32 f174, f172, f173;
mul.f32 f175, f169, f156;
fma.rn.f32 f176, f171, f138, f175;
mul.f32 f177, f158, f169;
mul.f32 f178, f159, f171;
sub.f32 f179, f177, f178;
mul.f32 f180, f158, f171;
fma.rn.f32 f181, f159, f169, f180;
mul.f32 f182, f179, f139;
mul.f32 f183, f181, f157;
sub.f32 f184, f182, f183;
mul.f32 f185, f179, f157;
fma.rn.f32 f186, f181, f139, f185;
mul.f32 f187, f158, f179;
mul.f32 f188, f159, f181;
sub.f32 f189, f187, f188;
mul.f32 f190, f158, f181;
fma.rn.f32 f191, f159, f179, f190;
mul.f32 f192, f189, f131;
mul.f32 f193, f191, f149;
sub.f32 f194, f192, f193;
mul.f32 f195, f189, f149;
fma.rn.f32 f196, f191, f131, f195;
shl.b32 r15, r14, 2;
add.s32 r16, r8, r15;
barrier.sync 0;
mad.lo.s32 r17, r12, 100, r16;
st.shared.f32 [r17], f117;
st.shared.f32 [r17+20], f164;
st.shared.f32 [r17+40], f174;
st.shared.f32 [r17+60], f184;
st.shared.f32 [r17+80], f194;
barrier.sync 0;
ld.shared.f32 f197, [r11];
ld.shared.f32 f198, [r11+100];
ld.shared.f32 f199, [r11+200];
ld.shared.f32 f200, [r11+300];
ld.shared.f32 f201, [r11+400];
barrier.sync 0;
st.shared.f32 [r17], f121;
st.shared.f32 [r17+20], f166;
st.shared.f32 [r17+40], f176;
st.shared.f32 [r17+60], f186;
st.shared.f32 [r17+80], f196;
barrier.sync 0;
ld.shared.f32 f202, [r11];
ld.shared.f32 f203, [r11+100];
ld.shared.f32 f204, [r11+200];
ld.shared.f32 f205, [r11+300];
ld.shared.f32 f206, [r11+400];
add.f32 f207, f198, f201;
add.f32 f208, f197, f207;
add.f32 f209, f199, f200;
add.f32 f210, f203, f206;
add.f32 f211, f202, f210;
add.f32 f212, f204, f205;
fma.rn.f32 f213, f207, 0f3E9E377A, f197;
mul.f32 f214, f209, 0f3F4F1BBD;
sub.f32 f215, f213, f214;
sub.f32 f216, f203, f206;
mul.f32 f217, f216, 0f3F737871;
sub.f32 f218, f204, f205;
mul.f32 f219, f218, 0fBF167918;
sub.f32 f220, f219, f217;
mul.f32 f221, f207, 0f3F4F1BBD;
sub.f32 f222, f197, f221;
fma.rn.f32 f223, f209, 0f3E9E377A, f222;
mul.f32 f224, f216, 0f3F167918;
mul.f32 f225, f218, 0f3F737871;
sub.f32 f226, f225, f224;
fma.rn.f32 f227, f210, 0f3E9E377A, f202;
mul.f32 f228, f212, 0f3F4F1BBD;
sub.f32 f229, f227, f228;
sub.f32 f230, f198, f201;
mul.f32 f231, f230, 0f3F737871;
sub.f32 f232, f199, f200;
mul.f32 f233, f232, 0fBF167918;
sub.f32 f234, f233, f231;
mul.f32 f235, f210, 0f3F4F1BBD;
sub.f32 f236, f202, f235;
fma.rn.f32 f237, f212, 0f3E9E377A, f236;
mul.f32 f238, f230, 0f3F167918;
mul.f32 f239, f232, 0f3F737871;
sub.f32 f240, f239, f238;
add.f32 %0, f209, f208;
add.f32 %1, f212, f211;
add.f32 %3, f234, f229;
sub.f32 %2, f215, f220;
sub.f32 %4, f223, f226;
add.f32 %5, f240, f237;
add.f32 %6, f226, f223;
sub.f32 %7, f237, f240;
sub.f32 %9, f229, f234;
add.f32 %8, f220, f215;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y));
};


#endif
